Class: FAIRChampionHarvester::CommonQueries

Inherits:
Object
  • Object
show all
Defined in:
lib/common_queries.rb

Class Method Summary collapse

Class Method Details

.GetDataIdentifier(graph:, meta: FAIRChampionHarvester::MetadataObject.new) ⇒ Object

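Pass the graph to query as `graph:`. Returns the first data identifier found, or nil when none of the known data predicates yields one.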



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/common_queries.rb', line 70

# Finds the identifier of the data described by a metadata graph.
#
# Each predicate in FAIRChampionHarvester::Utils::DATA_PREDICATES is tried in
# order, using a SPARQL query shaped for that kind of predicate. Only the first
# solution of each query is inspected; if its ?o binding responds to #value,
# that value is returned immediately. Progress and outcome notes are appended
# to meta.comments throughout.
#
# @param graph [#size] graph that the parsed SPARQL queries are executed against
# @param meta [#comments] object whose comments collection receives INFO messages
# @return [Object, nil] the #value of the first matching term, or nil when no
#   predicate produces a usable match
def self.GetDataIdentifier(graph:, meta: FAIRChampionHarvester::MetadataObject.new)
  meta.comments << "INFO: SPARQLing graph of size #{graph.size}.\n"

  FAIRChampionHarvester::Utils::DATA_PREDICATES.each do |prop|
    meta.comments << "INFO: SPARQLing for #{prop}.\n"

    # Every query binds its candidate to ?o so the shared handling below reads
    # one variable. (Previously the DCAT query projected an unbound ?b, and the
    # distribution query projected ?o without binding it, so neither branch
    # could ever succeed.)
    if prop =~ %r{schema\.org/distribution}
      # The identifier is the contentUrl of the distribution node; both the
      # http and https spellings of the schema.org namespace are accepted.
      sparql = "select ?o where {
                  VALUES ?schemaurl {<http://schema.org/contentUrl> <https://schema.org/contentUrl>}
                  VALUES ?dist {<http://schema.org/distribution> <https://schema.org/distribution>}
                  ?s ?dist ?b .
                  ?b ?schemaurl ?o}"
      noun = 'identifier'
      how = 'using Schema.org distribution property'
    elsif prop =~ /dcat\#/
      sparql = "select ?o where {
                  ?s <#{prop}> ?o .}"
      noun = 'data identifier'
      how = "using DCAT '#{prop}' property"
    elsif prop =~ /mainEntity/
      # The identifier hangs off the entity that mainEntity points at.
      sparql = "select ?o where {
                  VALUES ?schemaidentifier {<http://schema.org/identifier> <https://schema.org/identifier>}
                  ?s <#{prop}> ?entity .
                  ?entity ?schemaidentifier ?o}"
      noun = 'identifier'
      how = 'using schema:mainEntity containing a schema:identifier clause'
    else
      sparql = "select ?o where {?s <#{prop}> ?o}"
      noun = 'identifier'
      how = "using #{prop}"
    end

    results = SPARQL.parse(sparql).execute(graph)
    unless results.any?
      meta.comments << "INFO: '#{prop}' did not result in any query match.\n"
      next
    end

    candidate = results.first[:o]
    unless candidate.respond_to?(:value)
      meta.comments << "INFO: '#{prop}' data identifier did not have the expected structure.  Moving on.\n"
      next
    end

    # A local (not a class-level instance variable) so no state leaks between calls.
    identifier = candidate.value
    meta.comments << "INFO: found #{noun} '#{identifier}' #{how}.\n"
    return identifier
  end

  meta.comments << "INFO: No data identifier found in this chunk of metadata.\n"
  nil
end

.GetSelfIdentifier(g, meta = FAIRChampionHarvester::MetadataObject.new) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/common_queries.rb', line 3

# Collects every identifier the graph asserts through the predicates listed in
# FAIRChampionHarvester::Utils::SELF_IDENTIFIER_PREDICATES.
#
# For schema.org identifier predicates two shapes are tried in turn: first a
# PropertyValue node attached to a subject that nothing else points at, and,
# only when that yields no solutions at all, the plain object of the predicate.
# Every other predicate is queried for its plain object only. A note is
# appended to meta.comments for each identifier found and for each solution
# whose ?identifier binding does not respond to #value.
#
# @param g graph that the parsed SPARQL queries are executed against
# @param meta [#comments] object whose comments collection receives INFO messages
# @return [Array] identifier values in the order they were discovered (may be empty)
def self.GetSelfIdentifier(g, meta = FAIRChampionHarvester::MetadataObject.new)
  collected = []

  FAIRChampionHarvester::Utils::SELF_IDENTIFIER_PREDICATES.each do |prop|
    unless prop =~ %r{schema\.org/identifier}
      # Generic predicate: whatever it points at is the identifier.
      rows = SPARQL.parse("select ?identifier where {?s <#{prop}> ?identifier}").execute(g)
      next unless rows.any?

      rows.each do |row|
        term = row[:identifier]
        if term.respond_to? :value
          value = term.value
          meta.comments << "INFO: found identifier '#{value}' using #{prop} as a string or URI.\n"
          collected << value
        else
          meta.comments << "INFO: '#{prop}' as a simple identifier predicate did not have the expected structure.  Moving on.\n"
        end
      end
      next
    end

    # Attempt 1: identifier modelled as a schema.org PropertyValue hanging off
    # the root subject (a subject that never appears as an object).
    rows = SPARQL.parse("select ?identifier where {
                                  VALUES ?predi {<http://schema.org/identifier> <https://schema.org/identifier>}
                                  VALUES ?predpv {<http://schema.org/PropertyValue> <https://schema.org/PropertyValue>}
                                  VALUES ?predval {<http://schema.org/value> <https://schema.org/value>}
                                  ?s ?predi ?i .
                                  ?i a ?predpv .
                                  ?i ?predval ?identifier .
                                  FILTER NOT EXISTS {?sub ?pred ?s} } #must be the root, if not, we don't know what id it is!
              ").execute(g)
    if rows.any?
      rows.each do |row|
        term = row[:identifier]
        if term.respond_to? :value
          value = term.value
          meta.comments << "INFO: found identifier '#{value}' using Schema.org identifier as PropertyValue.\n"
          collected << value
        else
          meta.comments << "INFO: '#{prop}' PropertyValue did not have the expected structure.  Moving on.\n"
        end
      end
      next
    end

    # Attempt 2: the predicate points straight at a literal or URI.
    rows = SPARQL.parse("select ?identifier where {?s <#{prop}> ?identifier}").execute(g)
    next unless rows.any?

    rows.each do |row|
      term = row[:identifier]
      if term.respond_to? :value
        value = term.value
        meta.comments << "INFO: found identifier '#{value}' using Schema.org identifier as with a string or URI value.\n"
        collected << value
      else
        meta.comments << "INFO: '#{prop}' as a simple value did not have the expected structure.  Moving on.\n"
      end
    end
  end

  collected
end