Module: Html2rss::AutoSource::Scraper::Schema::CategoryExtractor
- Defined in:
- lib/html2rss/auto_source/scraper/schema/category_extractor.rb
Overview
Extracts categories from Schema.org structured data.
Class Method Summary
-
.call(schema_object) ⇒ Array<String>
Extracts categories from a schema object.
-
.extract_about_array(about) ⇒ Set<String>
Extracts categories from an about array.
-
.extract_about_array!(categories, about) ⇒ void
Adds categories from an about array to the given set (in place).
-
.extract_about_categories(schema_object) ⇒ Set<String>
Extracts categories from the about field.
-
.extract_about_categories!(categories, schema_object) ⇒ void
Adds categories from the about field to the given set (in place).
-
.extract_field_categories(schema_object) ⇒ Set<String>
Extracts categories from keywords, categories, and tags fields.
-
.extract_field_categories!(categories, schema_object) ⇒ void
Adds categories from the keywords, categories, and tags fields to the given set (in place).
-
.extract_field_value(schema_object, field) ⇒ Set<String>
Extracts categories from a single field value.
-
.extract_field_value!(categories, value) ⇒ void
Adds categories from a single field value (an array or a separator-delimited string) to the given set (in place).
-
.extract_string_categories(string) ⇒ Set<String>
Extracts categories from a string by splitting on separators.
-
.extract_string_categories!(categories, string) ⇒ void
Splits a string on `,`, `;`, or `|` and adds each trimmed, non-empty part to the given set (in place).
Class Method Details
.call(schema_object) ⇒ Array<String>
Extracts categories from a schema object.
15 16 17 18 19 20 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 15
##
# Collects the unique categories found on a schema object.
#
# Field-based categories are gathered first, then those from the about
# field; a Set is used as the accumulator so duplicates collapse.
#
# @param schema_object [Object] schema data handed on to the field and about extractors
# @return [Array<String>] the collected categories, in insertion order
def self.call(schema_object)
  collected = Set.new
  extract_field_categories!(collected, schema_object)
  extract_about_categories!(collected, schema_object)
  collected.to_a
end
.extract_about_array(about) ⇒ Set<String>
Extracts categories from an about array.
103 104 105 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 103
##
# Builds a fresh set holding the categories found in an about array.
#
# @param about [Array] the items of the about field
# @return [Set<String>] the categories produced by extract_about_array!
def self.extract_about_array(about)
  found = Set.new
  extract_about_array!(found, about)
  found
end
.extract_about_array!(categories, about) ⇒ void
This method returns an undefined value.
Adds categories from an about array to the given set (in place).
113 114 115 116 117 118 119 120 121 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 113
##
# Adds the categories found in an about array to +categories+.
#
# Hash items contribute their +:name+ value (converted with +to_s+); String
# items contribute themselves. Every candidate is stripped of surrounding
# whitespace and skipped when nothing remains, so blank names never become
# categories. Items of any other type are ignored.
#
# @param categories [Set<String>] accumulator that is mutated in place
# @param about [Array] the items of the about field
# @return [void]
def self.extract_about_array!(categories, about)
  about.each do |item|
    raw = case item
          when Hash then item[:name]
          when String then item
          end
    # Hash without a usable :name, or an unsupported item type.
    next unless raw

    name = raw.to_s.strip
    categories.add(name) unless name.empty?
  end
end
.extract_about_categories(schema_object) ⇒ Set<String>
Extracts categories from the about field.
48 49 50 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 48
##
# Builds a fresh set holding the categories found in the about field.
#
# @param schema_object [Object] schema data handed on to extract_about_categories!
# @return [Set<String>] the categories produced by extract_about_categories!
def self.extract_about_categories(schema_object)
  found = Set.new
  extract_about_categories!(found, schema_object)
  found
end
.extract_about_categories!(categories, schema_object) ⇒ void
This method returns an undefined value.
Adds categories from the about field to the given set (in place).
58 59 60 61 62 63 64 65 66 67 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 58
##
# Adds the categories found under +schema_object[:about]+ to +categories+.
#
# Supported shapes of the about value:
# - Array: handed to extract_about_array!
# - Hash: treated as a one-element about array, so a single object with a
#   +:name+ is no longer dropped
# - String: handed to extract_string_categories!
# A missing (nil/false) value or any other type adds nothing.
#
# @param categories [Set<String>] accumulator that is mutated in place
# @param schema_object [Hash] schema data, read via +[:about]+
# @return [void]
def self.extract_about_categories!(categories, schema_object)
  about = schema_object[:about]

  case about
  when Array then extract_about_array!(categories, about)
  when Hash then extract_about_array!(categories, [about])
  when String then extract_string_categories!(categories, about)
  end
end
.extract_field_categories(schema_object) ⇒ Set<String>
Extracts categories from keywords, categories, and tags fields.
27 28 29 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 27
##
# Builds a fresh set holding the categories found in the keywords,
# categories, and tags fields.
#
# @param schema_object [Object] schema data handed on to extract_field_categories!
# @return [Set<String>] the categories produced by extract_field_categories!
def self.extract_field_categories(schema_object)
  found = Set.new
  extract_field_categories!(found, schema_object)
  found
end
.extract_field_categories!(categories, schema_object) ⇒ void
This method returns an undefined value.
Adds categories from the keywords, categories, and tags fields to the given set (in place).
37 38 39 40 41 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 37
##
# Adds the categories found in the +:keywords+, +:categories+, and +:tags+
# entries of +schema_object+ to +categories+, visiting them in that order.
#
# @param categories [Set<String>] accumulator that is mutated in place
# @param schema_object [Hash] schema data, read via +[]+ with Symbol keys
# @return [void]
def self.extract_field_categories!(categories, schema_object)
  field_names = %i[keywords categories tags]
  field_names.each { |key| extract_field_value!(categories, schema_object[key]) }
end
.extract_field_value(schema_object, field) ⇒ Set<String>
Extracts categories from a single field value.
75 76 77 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 75
##
# Builds a fresh set holding the categories found in one field of a schema
# object.
#
# @param schema_object [Hash] schema data, read via +[]+ with a Symbol key
# @param field [#to_sym] name of the field to read
# @return [Set<String>] the categories produced by extract_field_value!
def self.extract_field_value(schema_object, field)
  found = Set.new
  field_value = schema_object[field.to_sym]
  extract_field_value!(found, field_value)
  found
end
.extract_field_value!(categories, value) ⇒ void
This method returns an undefined value.
Adds categories from a single field value (an array or a separator-delimited string) to the given set (in place).
85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 85
##
# Adds the categories contained in a single field value to +categories+.
#
# An Array contributes each item converted with +to_s+ and stripped of
# surrounding whitespace; items that end up empty (nil, "", "  ") are
# skipped, matching how separator-delimited strings are cleaned. A String
# is handed to extract_string_categories!. A missing value or any other
# type adds nothing.
#
# @param categories [Set<String>] accumulator that is mutated in place
# @param value [Array, String, nil] the raw field value
# @return [void]
def self.extract_field_value!(categories, value)
  case value
  when Array
    value.each do |item|
      label = item.to_s.strip
      categories.add(label) unless label.empty?
    end
  when String
    extract_string_categories!(categories, value)
  end
end
.extract_string_categories(string) ⇒ Set<String>
Extracts categories from a string by splitting on separators.
128 129 130 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 128
##
# Builds a fresh set holding the categories found in a separator-delimited
# string.
#
# @param string [String] the text handed on to extract_string_categories!
# @return [Set<String>] the categories produced by extract_string_categories!
def self.extract_string_categories(string)
  found = Set.new
  extract_string_categories!(found, string)
  found
end
.extract_string_categories!(categories, string) ⇒ void
This method returns an undefined value.
Splits a string on `,`, `;`, or `|` and adds each trimmed, non-empty part to the given set (in place).
138 139 140 141 142 143 |
# File 'lib/html2rss/auto_source/scraper/schema/category_extractor.rb', line 138
##
# Splits +string+ on commas, semicolons, or pipes and adds every part to
# +categories+ after stripping surrounding whitespace; parts that are empty
# after stripping are skipped.
#
# A nil +string+ is treated as "nothing to extract" instead of raising,
# in line with the nil guards of the other extractors in this module.
#
# @param categories [Set<String>] accumulator that is mutated in place
# @param string [String, nil] separator-delimited category text
# @return [void]
def self.extract_string_categories!(categories, string)
  return unless string

  string.split(/[,;|]/).each do |part|
    token = part.strip
    categories.add(token) unless token.empty?
  end
end