Class: Html2rss::AutoSource::Scraper::Schema::Thing
- Inherits:
-
Object
- Object
- Html2rss::AutoSource::Scraper::Schema::Thing
- Defined in:
- lib/html2rss/auto_source/scraper/schema/thing.rb
Overview
Thing serves as the base class for Schema.org schema objects.
Constant Summary collapse
- SUPPORTED_TYPES =
Supported Schema.org `@type` values mapped to article extraction.
%w[ AdvertiserContentArticle AnalysisNewsArticle APIReference Article AskPublicNewsArticle BackgroundNewsArticle BlogPosting DiscussionForumPosting LiveBlogPosting NewsArticle OpinionNewsArticle Report ReportageNewsArticle ReviewNewsArticle SatiricalArticle ScholarlyArticle SocialMediaPosting TechArticle ].to_set.freeze
- DEFAULT_ATTRIBUTES =
Attributes exposed by `#call` in generated article hashes.
%i[id title description url image published_at categories].freeze
Instance Attribute Summary collapse
-
#base_url ⇒ Object
readonly
Returns the value of attribute base_url.
-
#schema_object ⇒ Object
readonly
Returns the value of attribute schema_object.
Instance Method Summary collapse
-
#call ⇒ Hash
The scraped article hash with DEFAULT_ATTRIBUTES.
-
#categories ⇒ Array<String>?
Extracted category labels.
-
#description ⇒ String?
Longest available description field.
-
#id ⇒ String?
Stable schema object identifier.
-
#image ⇒ Html2rss::Url?
Normalized article image URL.
-
#image_urls ⇒ Array<String>
Normalized image URL candidates.
-
#initialize(schema_object, url:) ⇒ Thing
constructor
A new instance of Thing.
-
#normalized_base_url(url) ⇒ Html2rss::Url?
Normalized absolute URL for schema resolution.
-
#normalized_id(value, reference_url:) ⇒ String?
Normalized identifier value.
-
#normalized_id_url(text, reference_url:) ⇒ Html2rss::Url
Normalized identifier URL.
-
#normalized_id_value(url) ⇒ String?
Path/query portion used as stable ID.
-
#published_at ⇒ String?
Published-at timestamp string.
-
#title ⇒ String?
Article title.
-
#url ⇒ Html2rss::Url?
The URL of the schema object.
Constructor Details
#initialize(schema_object, url:) ⇒ Thing
Returns a new instance of Thing.
41 42 43 44 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 41 def initialize(schema_object, url:) @schema_object = schema_object @base_url = normalized_base_url(url) end |
Instance Attribute Details
#base_url ⇒ Object (readonly)
Returns the value of attribute base_url.
101 102 103 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 101

# Reader for the base URL assigned in the constructor.
#
# @return [Object] the value of @base_url (nil when none could be normalized)
def base_url = @base_url
#schema_object ⇒ Object (readonly)
Returns the value of attribute schema_object.
101 102 103 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 101

# Reader for the schema object handed to the constructor.
#
# @return [Object] the value of @schema_object
def schema_object = @schema_object
Instance Method Details
#call ⇒ Hash
Returns the scraped article hash with DEFAULT_ATTRIBUTES.
47 48 49 50 51 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 47

# Assembles the article hash by invoking the public reader named by each
# entry of DEFAULT_ATTRIBUTES, in the order the constant lists them.
#
# @return [Hash] attribute name => value returned by the reader of that name
def call
  DEFAULT_ATTRIBUTES.each_with_object({}) do |name, article|
    article[name] = public_send(name)
  end
end
#categories ⇒ Array<String>?
Returns extracted category labels.
95 96 97 98 99 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 95

# Category labels for the schema object, as produced by CategoryExtractor.
# The extractor runs at most once per instance; `defined?` is used so that a
# nil result is cached as well.
#
# @return [Array<String>, nil] whatever CategoryExtractor.call returned
def categories
  @categories = CategoryExtractor.call(schema_object) unless defined?(@categories)
  @categories
end
#description ⇒ String?
Returns longest available description field.
68 69 70 71 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 68

# Picks the longest text among the description-like properties.
#
# `articleBody` is the Schema.org property that carries the article text.
# The `:schema_object_body` key that was looked up before is not a Schema.org
# property (it appears to be a rename artefact); it is still consulted so
# existing inputs keep working.
#
# @return [String, nil] the longest candidate by `to_s.size`; on a tie the
#   earliest key in the list wins, so nil is returned when :description is nil
#   and no other candidate has a size greater than zero
def description
  schema_object.values_at(:description, :articleBody, :schema_object_body, :abstract)
               .max_by { |text| text.to_s.size }
end
#id ⇒ String?
Returns stable schema object identifier.
54 55 56 57 58 59 60 61 62 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 54

# Stable identifier for the schema object.
#
# Prefers the normalized `@id` property and falls back to the path of #url.
# The result is memoized, including a nil result, so repeated calls neither
# recompute the ID nor re-run #url (which logs when no URL is present).
#
# @return [String, nil] the identifier, or nil when nothing usable was found
def id
  return @id if defined?(@id)

  # Resolve #url once; it is needed both as the reference and as the fallback.
  article_url = url
  candidate = normalized_id(schema_object[:@id], reference_url: article_url || base_url) ||
              article_url&.path.to_s

  @id = candidate.empty? ? nil : candidate
end
#image ⇒ Html2rss::Url?
Returns normalized article image URL.
85 86 87 88 89 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 85

# Resolves the first image candidate from #image_urls into a URL object.
#
# @return [Html2rss::Url, nil] result of Url.from_relative, or nil when there
#   is no candidate
def image
  first_candidate = image_urls.first
  return unless first_candidate

  # Without a base URL the candidate itself is passed as the base.
  Url.from_relative(first_candidate, base_url || first_candidate)
end
#image_urls ⇒ Array<String>
Returns normalized image URL candidates.
104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 104

# Collects image URL candidates from the `image` and `thumbnailUrl`
# properties, in that order.
#
# Each property may hold a single value or an array of values. Strings are
# taken as-is; Hashes typed 'ImageObject' contribute their `url` (or, failing
# that, `contentUrl`). Anything else is skipped.
#
# @return [Array<String>] candidates in document order; empty when none
def image_urls
  # `flatten` expands array-valued properties; Hash entries are left intact.
  schema_object.values_at(:image, :thumbnailUrl).flatten.filter_map do |candidate|
    case candidate
    when String
      candidate
    when Hash
      (candidate[:url] || candidate[:contentUrl]) if candidate[:@type] == 'ImageObject'
    end
  end
end
#normalized_base_url(url) ⇒ Html2rss::Url?
Returns normalized absolute URL for schema resolution.
154 155 156 157 158 159 160 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 154

# Turns the given URL into an absolute URL object for schema resolution.
#
# @param url [Object] candidate URL; blank values (nil, empty, whitespace) are rejected
# @return [Html2rss::Url, nil] result of Url.from_absolute, or nil when the
#   input is blank or Url.from_absolute raises ArgumentError
def normalized_base_url(url)
  blank = url.to_s.strip.empty?
  Url.from_absolute(url) unless blank
rescue ArgumentError
  nil
end
#normalized_id(value, reference_url:) ⇒ String?
Returns normalized identifier value.
119 120 121 122 123 124 125 126 127 128 129 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 119

# Normalizes a raw identifier value.
#
# When the identifier resolves to a URL on the same host as `reference_url`,
# it is reduced via #normalized_id_value; otherwise the raw text is returned.
#
# @param value [Object] raw identifier; converted with `to_s`
# @param reference_url [Html2rss::Url, nil] URL whose host is compared against
# @return [String, nil] nil for an empty value; the raw text when there is no
#   reference URL, the hosts differ, or URL parsing raises ArgumentError
def normalized_id(value, reference_url:)
  raw = value.to_s
  unless raw.empty?
    candidate = normalized_id_url(raw, reference_url:)
    same_host = reference_url && candidate.host == reference_url.host
    same_host ? normalized_id_value(candidate) : raw
  end
rescue ArgumentError
  raw
end
#normalized_id_url(text, reference_url:) ⇒ Html2rss::Url
Returns normalized identifier URL.
134 135 136 137 138 139 140 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 134

# Builds a URL object from identifier text.
#
# @param text [String] identifier text; a leading '/' marks it as relative
# @param reference_url [Html2rss::Url, nil] base for relative identifiers;
#   when nil the text itself is passed as the base
# @return [Html2rss::Url] result of Url.from_relative or Url.from_absolute
def normalized_id_url(text, reference_url:)
  return Url.from_absolute(text) unless text.start_with?('/')

  Url.from_relative(text, reference_url || text)
end
#normalized_id_value(url) ⇒ String?
Returns path/query portion used as stable ID.
144 145 146 147 148 149 150 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 144

# Derives the stable ID from a URL's path and query.
#
# - path empty or '/' and a non-empty query: "<path>?<query>"
# - any other non-empty path: the path alone (the query is ignored)
# - empty path and no usable query: the query as-is (nil or empty)
#
# @param url [#path, #query] URL-like object
# @return [String, nil]
def normalized_id_value(url)
  path = url.path.to_s
  query = url.query
  root_like = path.empty? || path == '/'

  if root_like && !query.to_s.empty?
    "#{path}?#{query}"
  elsif path.empty?
    query
  else
    path
  end
end
#published_at ⇒ String?
Returns published-at timestamp string.
92 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 92

# Reads the `datePublished` property without parsing or converting it.
#
# @return [String, nil] the raw property value, nil when absent
def published_at
  schema_object[:datePublished]
end
#title ⇒ String?
Returns article title.
65 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 65

# Title of the article.
#
# `:title` keeps precedence so existing inputs behave as before. When it is
# absent, the Schema.org `headline` property (used by Article types) and then
# `name` (defined on Thing) are used instead.
#
# @return [String, nil] the first non-nil value among :title, :headline, :name
def title
  schema_object[:title] || schema_object[:headline] || schema_object[:name]
end
#url ⇒ Html2rss::Url?
Returns the URL of the schema object.
74 75 76 77 78 79 80 81 82 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 74

# Resolves the schema object's `url` property into a URL object.
#
# @return [Html2rss::Url, nil] result of Url.from_relative, or nil (after a
#   debug log entry) when the property is missing or empty
def url
  raw_url = schema_object[:url]
  # Without a base URL the raw value itself is passed as the base.
  return Url.from_relative(raw_url, base_url || raw_url) unless raw_url.to_s.empty?

  Log.debug("Schema#Thing.url: no url in schema_object: #{schema_object.inspect}")
  nil
end