11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
# File 'lib/extruct.rb', line 11
# Runs the external extruct command against +uri+ and folds whatever
# structured metadata it reports into +meta+.
#
# Progress and problems are appended to +meta+ as "INFO:" / "WARN:" strings.
# NOTE(review): the listing this was taken from reads `meta. << ...`, which
# Ruby parses as `meta << ...`; an attribute name may have been lost in
# extraction — confirm the intended receiver against the real file.
#
# @param meta [#<<, #merge_hash] collector for log lines and merged metadata
# @param uri [String] address handed to the extruct command line
# @param content_type [String, nil] response content type; when it matches
#   BINARY_CONTENT_TYPE the tool is not run
# @param body_prefix [String, nil] leading bytes of the response body; when
#   they match BINARY_MAGIC_BYTES the tool is not run
# @return [void] callers should rely on the side effects on +meta+ only
def self.do_extruct(meta, uri, content_type: nil, body_prefix: nil)
  if content_type&.match?(BINARY_CONTENT_TYPE)
    meta << "INFO: Skipping extruct for #{uri} — " \
            "binary content-type '#{content_type}' is not HTML-parseable.\n"
    return
  end
  if body_prefix&.match?(BINARY_MAGIC_BYTES)
    meta << "INFO: Skipping extruct for #{uri} — binary file signature detected in response body.\n"
    return
  end

  meta << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"

  # Required here so this method stays self-contained; the file's own require
  # list is not visible from this block.
  require "shellwords"

  warn "begin open3"
  # The command is a single string and therefore goes through the shell, so
  # the URI must be escaped: characters such as `&`, `;` or `$()` in a URL
  # would otherwise be interpreted by the shell instead of reaching extruct.
  command = "#{FAIRChampionHarvester::Utils::ExtructCommand} #{Shellwords.escape(uri)}"
  stdout, stderr, status = Open3.capture3(command)
  warn ""
  warn "open3 status: #{status} #{stdout}"

  # NOTE(review): the payload is read from stderr, as in the original code;
  # stdout is only echoed above. Confirm this matches how the configured
  # command emits its JSON.
  result = stderr.to_s

  if (failure = result[/(Failed\sto\sextract.*?)\n/, 1])
    meta << "WARN: extruct threw an error #{failure} when attempting to parse return value (message body) of #{uri}.\n"
    if (detail = result[/(ValueError:.*?)\n/, 1])
      meta << "WARN: extruct error was #{detail}\n"
    end
  elsif result.match?(/^\s*[{\[]/)
    # `\s*` (not `\s+`) so compact JSON that starts at column 0 is recognised.
    begin
      json = JSON.parse(result)
    rescue StandardError
      warn "json parsing failed! This is bad!\n"
      meta << "INFO: the extruct tool found non-parseable data at #{uri}. Aborting attempt to read it\n"
      return
    end

    meta << "INFO: the extruct tool found parseable data at #{uri}\n"
    case json
    when Hash
      # `&.any?` tolerates syntaxes that are absent from the output.
      Core.parse_rdf(meta, json["json-ld"].to_json, "application/ld+json") if json["json-ld"]&.any?
      meta.merge_hash(json["microdata"].first) if json["microdata"]&.any?
      meta.merge_hash(json["microformat"].first) if json["microformat"]&.any?
      meta.merge_hash(json["opengraph"].first) if json["opengraph"]&.any?
      Core.parse_rdf(meta, json["rdfa"].to_json, "application/ld+json") if json["rdfa"]&.any?
    when Array
      # A top-level array cannot be indexed by syntax name; merge its first
      # record when that record is a hash.
      meta.merge_hash(json.first) if json.first.is_a?(Hash)
    end
  else
    meta << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
  end
end
|