3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
# File 'lib/url.rb', line 3
# Resolves +guid+ over HTTP and hands the response body to the parser that
# matches the detected content type, recording progress messages on +meta+.
#
# Steps, in order:
# 1. Default +meta.guidtype+ to "uri" when it is nil.
# 2. Fetch +guid+ through Core.fetch; give up (returning +meta+) if no
#    response head comes back.
# 3. Append the body to +meta.full_response+.
# 4. Unless +nolinkheaders+ is set, collect links from the HTTP headers and
#    from the body, and resolve each one with +nolinkheaders: true+ so the
#    followed links do not fan out further.
# 5. Dispatch on the parser name reported by Core.figure_out_type: text, RDF,
#    HTML, XML or JSON; anything else goes through Tika, Distiller and extruct.
#
# NOTE(review): the accumulator name +comments+ and the helper name
# +Core.parse_link_http_headers+ are assumed here; confirm both against the
# Meta object and Core definitions.
#
# @param guid the identifier to resolve; passed to Core.fetch and used as the
#   fallback URL when the last entry of +meta.finalURI+ has no scheme
# @param meta the metadata accumulator; must expose +guidtype+, +comments+,
#   +finalURI+ and +full_response+
# @param nolinkheaders [Boolean] when true, links found in the HTTP headers
#   or the body are not collected or followed
# @param headers the request headers handed to Core.fetch
# @return the same +meta+ object that was passed in
def self.resolve_url(guid:, meta:, nolinkheaders: false, headers: FAIRChampionHarvester::Utils::AcceptHeader)
  meta.guidtype = "uri" if meta.guidtype.nil?
  warn "\n\n FETCHING #{guid} #{headers}\n\n"
  head, body = Core.fetch(guid: guid, headers: headers, meta: meta)
  unless head
    meta.comments << "WARN: Unable to resolve #{guid} using HTTP Accept header #{headers}.\n"
    return meta
  end

  meta.comments << "INFO: following redirection using this header led to the following URL: #{meta.finalURI.last}. Using the output from this URL for the next few tests..."
  meta.full_response << body

  links = []
  unless nolinkheaders
    links << Core.parse_link_http_headers(head)
    links << Core.parse_link_body_headers(guid, body)
  end
  links.flatten!
  links.compact!
  warn "\n\n\nLINKS TO FOLLOW: #{links}\n\n\n"
  links.each do |link|
    meta.comments << "INFO: a Link 'alternate' or 'meta' header was found: #{link}, and is now being followed as an independent URI that may contain metadata.\n"
    # nolinkheaders: true stops the followed link from fanning out further.
    FAIRChampionHarvester::URL.resolve_url(guid: link, meta: meta, nolinkheaders: true)
    meta.comments << "INFO: parsing of Link #{link} complete.\n"
  end
  meta.comments << "INFO: Link Header and Meta Link parsing complete. Back in main thread.\n"

  parser, contenttype = Core.figure_out_type(head)
  meta.comments << "INFO: Found #{parser} #{contenttype} type of content when resolving #{guid} using HTTP Accept header #{headers}.\n"
  warn "\n\nFound #{parser} type of file by resolving GUID #{guid}. \n\n"

  # URL-based tools get the final URI only if it carries a scheme; otherwise
  # they fall back to the GUID that was requested.
  landing_url = lambda do
    final = meta.finalURI.last
    final =~ %r{^\w+://} ? final : guid
  end

  if FAIRChampionHarvester::Utils::TEXT_FORMATS.keys.include?(parser)
    warn "\n\nPARSING TEXT\n\n"
    meta.comments << "INFO: parsing as plaintext. \n"
    Core.parse_text(meta, body)
  elsif FAIRChampionHarvester::Utils::RDF_FORMATS.keys.include?(parser)
    warn "\n\nPARSING RDF\n\n"
    meta.comments << "INFO: parsing as linked data. \n"
    # TriG is the only content type forwarded explicitly to the RDF parser.
    if contenttype == "application/trig"
      Core.parse_rdf(meta, body, contenttype)
    else
      Core.parse_rdf(meta, body)
    end
  elsif FAIRChampionHarvester::Utils::HTML_FORMATS.keys.include?(parser)
    meta.comments << "INFO: parsing as HTML. \n"
    warn "\n\nPARSING HTML\n\n"
    url = landing_url.call
    meta.comments << "INFO: Now attempting to use the extruct parser. \n"
    FAIRChampionHarvester::Extruct.do_extruct(meta, url, content_type: head[:content_type], body_prefix: body[0, 8])
    meta.comments << "INFO: Now attempting to use the Kellogg's Distiller parser. \n"
    meta.comments << "INFO: Note that, if the Distiller fails, you can view the output of its parse by visiting http://rdf.greggkellogg.net/distiller?command=serialize&url=#{CGI.escape(url.to_s)}. \n"
    FAIRChampionHarvester::Distiller.do_distiller(meta, body)
  elsif FAIRChampionHarvester::Utils::XML_FORMATS.keys.include?(parser)
    meta.comments << "INFO: parsing as XML. \n"
    warn "\n\nPARSING XML\n\n"
    Core.parse_xml(meta, body)
  elsif FAIRChampionHarvester::Utils::JSON_FORMATS.keys.include?(parser)
    meta.comments << "INFO: parsing as JSON. \n"
    warn "\n\nPARSING JSON\n\n"
    Core.parse_json(meta, body)
  else
    # Unknown content type: try every embedded-metadata extractor in turn.
    meta.comments << "INFO: Body of the message did not match known structured data types. \n"
    warn "\n\nPARSING UNKNOWN\n\n"
    url = landing_url.call
    warn "\n\nPARSING UNKNOWN from #{url}\n\n"
    meta.comments << "WARN: parser could not be found. \n"
    warn "\n\nPARSING WITH TIKA\n\n"
    meta.comments << "INFO: Metadata may be embedded, now searching using the Apache 'tika' tool.\n"
    FAIRChampionHarvester::Tika.do_tika(meta, body)
    warn "\n\nPARSING WITH DISTILLER\n\n"
    meta.comments << "INFO: Metadata may be embedded, now searching using the 'Distiller' tool.\n"
    FAIRChampionHarvester::Distiller.do_distiller(meta, body)
    warn "\n\nPARSING WITH EXTRUCT\n\n"
    meta.comments << "INFO: Metadata may be embedded, now searching using the 'extruct' tool.\n"
    FAIRChampionHarvester::Extruct.do_extruct(meta, url, content_type: head[:content_type], body_prefix: body[0, 8])
  end
  meta
end
|