Class: FAIRChampionHarvester::Tika
- Inherits:
-
Object
- Object
- FAIRChampionHarvester::Tika
- Defined in:
- lib/tika.rb
Class Method Summary collapse
Class Method Details
.do_tika(meta, body) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/tika.rb', line 5
#
# Uploads +body+ to the Apache Tika endpoint with curl, asking for RDF/XML,
# and passes the response to {parse_tika_output}. A body whose SHA256 digest
# is already recorded in @seen_digests is skipped.
#
# NOTE(review): @seen_digests is not initialised in this method; presumably
# it is set up elsewhere on the class — confirm.
#
# @param meta [#comments] object whose +comments+ collection receives
#   progress and warning messages; it is also forwarded to parse_tika_output
# @param body [#to_s] content to be examined by Tika
# @return [Object, nil] nil when the body is skipped or the curl call fails,
#   otherwise whatever parse_tika_output returns
def self.do_tika(meta, body)
  digest = Digest::SHA256.hexdigest(body.to_s)
  if @seen_digests.include?(digest)
    meta.comments << "INFO: Tika skipping body — identical content already parsed.\n"
    return
  end
  @seen_digests << digest

  file = Tempfile.new("foo")
  begin
    file.binmode
    file.write(body)
    # curl opens the file by path, so buffered bytes must be on disk first.
    file.flush

    meta.comments << "INFO: The message body is being examined by Apache Tika\n"
    stdout, stderr, status = Open3.capture3(
      "curl", "--silent", "--show-error",
      "--connect-timeout", "10",
      "-T", file.path,
      "--header", "Accept: application/rdf+xml",
      FAIRChampionHarvester::Utils::TikaCommand
    )
  ensure
    # Close and delete the temp file even if writing or spawning curl raised.
    file.close!
  end

  unless status.success?
    meta.comments << "WARN: Tika curl call failed (exit #{status.exitstatus}): #{stderr.strip}\n"
    return
  end

  meta.comments << "INFO: The response from Apache Tika is being parsed\n"
  FAIRChampionHarvester::Tika.parse_tika_output(meta, stdout)
end
.parse_tika_output(meta, output) ⇒ Object
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/tika.rb', line 37
#
# Extracts the RDF/XML embedded in a Tika response and merges its statements
# into +meta+.
#
# When asked for RDF/XML, Tika returns it INSIDE another XML element, so the
# response cannot be parsed as RDF directly; the rdf:RDF node is pulled out
# with XPath first.
#
# @param meta [#comments, #merge_rdf] receives log messages via +comments+
#   and the parsed statements via +merge_rdf+
# @param output [String, nil] raw response from Tika; nil is treated as an
#   empty string and therefore rejected as non-XML
# @return [Object, nil] nil when the output does not start with "<",
#   otherwise the result of appending the final message to +meta.comments+
def self.parse_tika_output(meta, output)
  output = output.to_s
  meta.comments << "INFO: entering Tika parser - sample of input #{output[0..50]}.\n"

  unless output.start_with?("<") # check if it is XML
    meta.comments << "CRITICAL: Tika parser expected XML. Aborting. \n"
    return
  end

  xml = Nokogiri::XML(output)
  rdf = xml.xpath("//rdf:RDF", "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
  rdf_string = rdf.to_xml

  reader = RDF::Format.for(content_type: "application/rdf+xml").reader.new(rdf_string)
  graph = RDF::Graph.new << reader
  meta.merge_rdf(graph.statements)
  meta.comments << "INFO: Tika executed successfully (this doesn't necessarily mean that it discovered any metadata...)\n"
end