4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
# File 'lib/distiller.rb', line 4
# Runs the external RDF command-line tool over a message body to extract
# embedded (RDFa) metadata as JSON-LD, then hands any result to
# Core.parse_rdf.
#
# Each body is processed at most once: its SHA-256 digest is recorded in
# @@distillerknown, and a body whose digest is already present is skipped.
#
# NOTE(review): every log line is appended with `meta <<`, so `meta` only has
# to respond to `<<` here. Confirm against callers that `meta` itself (rather
# than one of its attributes) is the intended receiver.
#
# @param meta [#<<] receiver of the INFO/WARN log lines; also passed through
#   to Core.parse_rdf
# @param body [String] raw message body to examine; it is not modified
# @return [Object, nil] nil when the body was already processed; otherwise the
#   value of Core.parse_rdf (data found) or of the final `meta <<` (no data)
def self.do_distiller(meta, body)
  bhash = Digest::SHA256.hexdigest(body)
  if @@distillerknown[bhash]
    meta << "INFO: Cached data is already parsed. Returning\n"
    return
  end
  @@distillerknown[bhash] = true
  meta << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"

  # Work on a private copy so the caller's string is neither re-encoded nor
  # scrubbed in place (and so a frozen body does not raise FrozenError).
  text = body.dup.force_encoding("UTF-8").scrub
  # Point a bare schema.org @context at the explicit JSON-LD context document.
  # NOTE(review): presumably needed so the context resolves to JSON-LD — confirm.
  text = text.gsub(%r{"@context"\s*:\s*"https?://schema.org/?"}, '"@context": "https://schema.org/docs/jsonldcontext.json"')

  result = nil
  file = Tempfile.new("foo", encoding: "UTF-8")
  begin
    file.write(text)
    # rewind flushes the buffered write so the external command sees the data.
    file.rewind
    meta << "INFO: The message body is being examined by Distiller\n"
    command = "LANG=en_US.UTF-8 #{FAIRChampionHarvester::Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
    warn "distiller command: #{command}"
    # stderr and exit status are deliberately ignored; only stdout is inspected.
    result, _stderr, _status = Open3.capture3(command)
    warn ""
  ensure
    # Always remove the temp file, even if writing or the command raised.
    file.close
    file.unlink
  end

  result = result.force_encoding("UTF-8")
  if result.match?(/@context/i)
    meta << "INFO: The Distiller found parseable data. Parsing as RDF\n"
    Core.parse_rdf(meta, result, "application/ld+json")
  else
    meta << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML, or simply lack of embedded metadata.\n"
  end
end
|