Module: Html2rss::AutoSource::Scraper::Schema::CategoryExtractor
- Defined in:
- lib/html2rss/auto_source/scraper/schema/category_extractor.rb
Overview
Extracts categories from Schema.org structured data.
Class Method Summary
-
.call(schema_object) ⇒ Array<String>
Extracts categories from a schema object.
-
.extract_about_array(about) ⇒ Set<String>
Extracts categories from an about array.
-
.extract_about_categories(schema_object) ⇒ Set<String>
Extracts categories from the about field.
-
.extract_field_categories(schema_object) ⇒ Set<String>
Extracts categories from keywords, categories, and tags fields.
-
.extract_field_value(schema_object, field) ⇒ Set<String>
Extracts categories from a single field value.
-
.extract_string_categories(string) ⇒ Set<String>
Extracts categories from a string by splitting on separators.
Class Method Details
.call(schema_object) ⇒ Array<String>
Extracts categories from a schema object.
15 16 17 18 19 20 21 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 15
#
# Collects every category found in a schema object.
#
# Categories gathered from the keyword-style fields come first, followed by
# any additional ones found in the about field; duplicates are removed by
# the set union.
#
# @param schema_object [Hash] the schema object to read categories from
# @return [Array<String>] the distinct categories
def self.call(schema_object)
  from_fields = extract_field_categories(schema_object)

  # Set#+ is the same union as Set#|, so entries present in both appear once.
  (from_fields + extract_about_categories(schema_object)).to_a
end
.extract_about_array(about) ⇒ Set<String>
Extracts categories from an about array.
78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 78
#
# Extracts categories from an about array.
#
# Hash items contribute their +:name+ value (converted with +to_s+) and
# String items contribute themselves; items of any other type are ignored.
# Each candidate is stripped of surrounding whitespace and skipped when
# nothing remains, so blank entries never become categories.
#
# @param about [Array] the items of a schema object's about field
# @return [Set<String>] the non-blank category names
def self.extract_about_array(about)
  about.each_with_object(Set.new) do |item, categories|
    name = case item
           when Hash then item[:name]
           when String then item
           end
    # Unsupported item types and hashes without a name contribute nothing.
    next unless name

    label = name.to_s.strip
    categories << label unless label.empty?
  end
end
.extract_about_categories(schema_object) ⇒ Set<String>
Extracts categories from the about field.
41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 41
#
# Extracts categories from the about field.
#
# Supported shapes of +schema_object[:about]+:
# * Array  - delegated to +extract_about_array+
# * String - delegated to +extract_string_categories+
# * Hash   - a single object; its +:name+ (stripped, if non-blank) is the
#   only category
# Anything else, including a missing field, yields an empty set.
#
# @param schema_object [Hash] the schema object to read the about field from
# @return [Set<String>] the categories found in the about field
def self.extract_about_categories(schema_object)
  about = schema_object[:about]

  case about
  when Array then extract_about_array(about)
  when String then extract_string_categories(about)
  when Hash
    # A lone object is described by its name; without one there is nothing
    # usable as a category.
    name = about[:name]
    label = name ? name.to_s.strip : ''
    label.empty? ? Set.new : Set[label]
  else
    Set.new
  end
end
.extract_field_categories(schema_object) ⇒ Set<String>
Extracts categories from keywords, categories, and tags fields.
28 29 30 31 32 33 34 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 28
#
# Gathers the categories stored under the keywords, categories and tags
# fields of a schema object into a single set.
#
# @param schema_object [Hash] the schema object to read the fields from
# @return [Set<String>] the combined categories of the three fields
def self.extract_field_categories(schema_object)
  %w[keywords categories tags].each_with_object(Set.new) do |field_name, collected|
    collected.merge(extract_field_value(schema_object, field_name))
  end
end
.extract_field_value(schema_object, field) ⇒ Set<String>
Extracts categories from a single field value.
60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 60
#
# Extracts categories from a single field value.
#
# The field is looked up under its Symbol key. An Array value contributes
# one category per entry: Hash entries are represented by their +:name+,
# every other entry by its +to_s+ form. Entries are stripped of surrounding
# whitespace and dropped when blank. A String value is delegated to
# +extract_string_categories+. Any other value, including a missing field,
# yields an empty set.
#
# @param schema_object [Hash] the schema object to read the field from
# @param field [String, Symbol] the name of the field to read
# @return [Set<String>] the categories found in the field
def self.extract_field_value(schema_object, field)
  value = schema_object[field.to_sym]

  case value
  when Array
    labels = value.map do |entry|
      # Hash#to_s would leak an inspect string into the feed; use the
      # entry's name instead (nil.to_s is "" and is rejected below).
      (entry.is_a?(Hash) ? entry[:name] : entry).to_s.strip
    end
    Set.new(labels.reject(&:empty?))
  when String then extract_string_categories(value)
  else Set.new
  end
end
.extract_string_categories(string) ⇒ Set<String>
Extracts categories from a string by splitting on separators.
95 96 97 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 95
#
# Splits a delimited string into categories.
#
# The string is cut at every comma, semicolon or pipe; each piece is
# stripped of surrounding whitespace and pieces that end up empty are
# discarded.
#
# @param string [String] the delimited category list
# @return [Set<String>] the distinct, non-empty categories
def self.extract_string_categories(string)
  string.split(/[,;|]/).each_with_object(Set.new) do |piece, categories|
    label = piece.strip
    categories << label unless label.empty?
  end
end