Module: Html2rss

Defined in:
lib/html2rss.rb,
lib/html2rss/cli.rb,
lib/html2rss/url.rb,
lib/html2rss/error.rb,
lib/html2rss/config.rb,
lib/html2rss/version.rb,
lib/html2rss/hash_util.rb,
lib/html2rss/rendering.rb,
lib/html2rss/selectors.rb,
lib/html2rss/auto_source.rb,
lib/html2rss/rss_builder.rb,
lib/html2rss/config/schema.rb,
lib/html2rss/configuration.rb,
lib/html2rss/feed_pipeline.rb,
lib/html2rss/html_extractor.rb,
lib/html2rss/html_navigator.rb,
lib/html2rss/blocked_surface.rb,
lib/html2rss/request_service.rb,
lib/html2rss/request_session.rb,
lib/html2rss/config/validator.rb,
lib/html2rss/request_controls.rb,
lib/html2rss/selectors/config.rb,
lib/html2rss/json_feed_builder.rb,
lib/html2rss/category_extractor.rb,
lib/html2rss/auto_source/cleanup.rb,
lib/html2rss/auto_source/scraper.rb,
lib/html2rss/rss_builder/article.rb,
lib/html2rss/rss_builder/channel.rb,
lib/html2rss/config/class_methods.rb,
lib/html2rss/selectors/extractors.rb,
lib/html2rss/articles/deduplicator.rb,
lib/html2rss/config/dynamic_params.rb,
lib/html2rss/rss_builder/enclosure.rb,
lib/html2rss/config/request_headers.rb,
lib/html2rss/json_feed_builder/item.rb,
lib/html2rss/rendering/pdf_renderer.rb,
lib/html2rss/request_service/budget.rb,
lib/html2rss/request_service/policy.rb,
lib/html2rss/rss_builder/stylesheet.rb,
lib/html2rss/request_service/context.rb,
lib/html2rss/auto_source/scraper/html.rb,
lib/html2rss/rendering/audio_renderer.rb,
lib/html2rss/rendering/image_renderer.rb,
lib/html2rss/rendering/media_renderer.rb,
lib/html2rss/rendering/video_renderer.rb,
lib/html2rss/request_service/response.rb,
lib/html2rss/request_service/strategy.rb,
lib/html2rss/selectors/extractors/href.rb,
lib/html2rss/selectors/extractors/html.rb,
lib/html2rss/selectors/extractors/text.rb,
lib/html2rss/selectors/post_processors.rb,
lib/html2rss/auto_source/scraper/schema.rb,
lib/html2rss/feed_pipeline/auto_fallback.rb,
lib/html2rss/selectors/extractors/static.rb,
lib/html2rss/config/multiple_feeds_config.rb,
lib/html2rss/auto_source/scraper/microdata.rb,
lib/html2rss/html_extractor/date_extractor.rb,
lib/html2rss/rendering/description_builder.rb,
lib/html2rss/request_session/runtime_input.rb,
lib/html2rss/auto_source/scraper/json_state.rb,
lib/html2rss/html_extractor/image_extractor.rb,
lib/html2rss/html_extractor/list_candidates.rb,
lib/html2rss/request_service/response_guard.rb,
lib/html2rss/request_session/rel_next_pager.rb,
lib/html2rss/request_session/runtime_policy.rb,
lib/html2rss/selectors/extractors/attribute.rb,
lib/html2rss/selectors/post_processors/base.rb,
lib/html2rss/selectors/post_processors/gsub.rb,
lib/html2rss/auto_source/scraper/schema/thing.rb,
lib/html2rss/request_service/faraday_strategy.rb,
lib/html2rss/request_service/puppet_commander.rb,
lib/html2rss/auto_source/scraper/semantic_html.rb,
lib/html2rss/auto_source/scraper/wordpress_api.rb,
lib/html2rss/selectors/object_to_xml_converter.rb,
lib/html2rss/html_extractor/enclosure_extractor.rb,
lib/html2rss/html_extractor/semantic_containers.rb,
lib/html2rss/selectors/post_processors/template.rb,
lib/html2rss/auto_source/scraper/link_heuristics.rb,
lib/html2rss/request_service/botasaurus_contract.rb,
lib/html2rss/request_service/botasaurus_strategy.rb,
lib/html2rss/selectors/post_processors/parse_uri.rb,
lib/html2rss/selectors/post_processors/substring.rb,
lib/html2rss/auto_source/scraper/schema/item_list.rb,
lib/html2rss/auto_source/scraper/schema/list_item.rb,
lib/html2rss/request_service/browserless_strategy.rb,
lib/html2rss/selectors/post_processors/parse_time.rb,
lib/html2rss/selectors/post_processors/sanitize_html.rb,
lib/html2rss/html_extractor/semantic_anchor_candidates.rb,
lib/html2rss/selectors/post_processors/html_to_markdown.rb,
lib/html2rss/selectors/post_processors/markdown_to_html.rb,
lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb,
lib/html2rss/auto_source/scraper/schema/category_extractor.rb,
lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb,
lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb,
lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb,
lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb,
lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb

Overview

Main html2rss namespace.

Defined Under Namespace

Modules: Articles, BlockedSurface, HashUtil, Log, Rendering

Classes: AutoSource, CLI, CategoryExtractor, Config, Configuration, Error, FeedPipeline, HtmlExtractor, HtmlNavigator, JsonFeedBuilder, NoFeedItemsExtracted, RequestControls, RequestService, RequestSession, RssBuilder, Selectors, Url

Constant Summary collapse

VERSION =

Current application version.

'0.20.0'

Class Method Summary collapse

Class Method Details

.auto_json_feed(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil) ⇒ Hash

Scrapes the provided URL and returns a JSONFeed 1.1 hash.

Parameters:

  • url (String)

    source page URL

  • strategy (Symbol) (defaults to: :auto)

    request strategy to use

  • items_selector (String, nil) (defaults to: nil)

    optional selector hint for item extraction

  • max_redirects (Integer, nil) (defaults to: nil)

    optional redirect limit override

  • max_requests (Integer, nil) (defaults to: nil)

    optional request budget override

Returns:

  • (Hash)

    JSONFeed-compliant hash



77
78
79
# File 'lib/html2rss.rb', line 77

# Scrapes the provided URL and returns a JSONFeed 1.1 hash.
#
# @param url [String] source page URL
# @param strategy [Symbol] request strategy to use
# @param items_selector [String, nil] optional selector hint for item extraction
# @param max_redirects [Integer, nil] optional redirect limit override
# @param max_requests [Integer, nil] optional request budget override
# @return [Hash] JSONFeed-compliant hash
def self.auto_json_feed(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
  # Assemble the auto-source configuration first, then reuse the regular JSONFeed entry point.
  auto_config = build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
  json_feed(auto_config)
end

.auto_source(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil) ⇒ RSS::Rss

Scrapes the provided URL and returns an RSS object.

Parameters:

  • url (String)

    source page URL

  • strategy (Symbol) (defaults to: :auto)

    request strategy to use

  • items_selector (String, nil) (defaults to: nil)

    optional selector hint for item extraction

  • max_redirects (Integer, nil) (defaults to: nil)

    optional redirect limit override

  • max_requests (Integer, nil) (defaults to: nil)

    optional request budget override

Returns:

  • (RSS::Rss)

    generated RSS feed



64
65
66
# File 'lib/html2rss.rb', line 64

# Scrapes the provided URL and returns an RSS object.
#
# @param url [String] source page URL
# @param strategy [Symbol] request strategy to use
# @param items_selector [String, nil] optional selector hint for item extraction
# @param max_redirects [Integer, nil] optional redirect limit override
# @param max_requests [Integer, nil] optional request budget override
# @return [RSS::Rss] generated RSS feed
def self.auto_source(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
  # Assemble the auto-source configuration first, then reuse the regular RSS entry point.
  auto_config = build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
  feed(auto_config)
end

.config_from_yaml_file(file, feed_name = nil) ⇒ Hash{Symbol => Object}

Loads a feed configuration from YAML.

Parameters:

  • file (String)

    path to the YAML file

  • feed_name (String, nil) (defaults to: nil)

    optional feed name inside a multi-feed config

Returns:

  • (Hash{Symbol => Object})

    loaded configuration hash



33
34
35
# File 'lib/html2rss.rb', line 33

# Loads a feed configuration from YAML by delegating to Config.load_yaml.
#
# @param file [String] path to the YAML file
# @param feed_name [String, nil] optional feed name inside a multi-feed config
# @return [Hash{Symbol => Object}] loaded configuration hash
def self.config_from_yaml_file(file, feed_name = nil) = Config.load_yaml(file, feed_name)

.configuration ⇒ Html2rss::Configuration

Returns the global configuration instance.

Returns:

  • (Html2rss::Configuration)

    the global configuration instance



85
86
87
# File 'lib/html2rss.rb', line 85

# Returns the global configuration instance, creating a frozen default on first use.
#
# @return [Html2rss::Configuration] the memoized configuration
def configuration
  # Reuse whatever has been stored already (by a previous call or by the configure method).
  return @configuration if @configuration

  @configuration = Configuration.new.freeze
end

.configure {|config| ... } ⇒ Html2rss::Configuration

Configures global library defaults.

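Yield Parameters:

  • config (Html2rss::Configuration)

    mutable copy of the current configuration

Returns:

  • (Html2rss::Configuration)

    the updated, frozen configuration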



94
95
96
97
98
# File 'lib/html2rss.rb', line 94

# Configures global library defaults.
#
# The block receives an unfrozen duplicate of the current configuration; once the
# block returns, that duplicate is frozen and stored as the new global configuration.
#
# @yieldparam config [Html2rss::Configuration] duplicate of the current configuration
# @return [Html2rss::Configuration] the frozen configuration that was just stored
def configure
  # tap hands the duplicate to the caller's block and passes the same object on to freeze.
  @configuration = configuration.dup.tap { |draft| yield draft }.freeze
end

.feed(raw_config) ⇒ RSS::Rss

Returns an RSS object generated from the provided configuration.

Parameters:

  • raw_config (Hash{Symbol => Object})

    feed configuration

Returns:

  • (RSS::Rss)

    generated RSS feed



42
43
44
# File 'lib/html2rss.rb', line 42

# Returns an RSS object generated from the provided configuration.
#
# @param raw_config [Hash{Symbol => Object}] feed configuration
# @return [RSS::Rss] generated RSS feed
def self.feed(raw_config)
  pipeline = FeedPipeline.new(raw_config)
  pipeline.to_rss
end

.json_feed(raw_config) ⇒ Hash

Returns a JSONFeed 1.1 hash generated from the provided configuration.

Parameters:

  • raw_config (Hash{Symbol => Object})

    feed configuration

Returns:

  • (Hash)

    JSONFeed-compliant hash



51
52
53
# File 'lib/html2rss.rb', line 51

# Returns a JSONFeed 1.1 hash generated from the provided configuration.
#
# @param raw_config [Hash{Symbol => Object}] feed configuration
# @return [Hash] JSONFeed-compliant hash
def self.json_feed(raw_config)
  pipeline = FeedPipeline.new(raw_config)
  pipeline.to_json_feed
end

.logger ⇒ Object

Returns the logger.

Returns:

  • (Object)

    the logger



102
103
104
# File 'lib/html2rss.rb', line 102

# Returns the logger held by the global configuration.
#
# @return [Object] the logger
def logger = configuration.logger

.logger=(logger) ⇒ Object

Parameters:

  • logger (Object)

    the new logger



108
109
110
# File 'lib/html2rss.rb', line 108

# Replaces the logger by running the change through configure.
#
# @param logger [Object] the new logger
def logger=(logger)
  configure do |settings|
    settings.logger = logger
  end
end