Class: Relaton::Jis::DataFetcher
- Inherits:
-
Core::DataFetcher
- Object
- Core::DataFetcher
- Relaton::Jis::DataFetcher
- Defined in:
- lib/relaton/jis/data_fetcher.rb
Constant Summary collapse
- URL =
"https://webdesk.jsa.or.jp/books/"
Instance Method Summary collapse
- #agent ⇒ Object
- #count ⇒ Object
- #create_thread_pool(size) ⇒ Object
- #end_threads_and_wait ⇒ Object
- #fetch(_source = nil) ⇒ Object
-
#fetch_doc(url) ⇒ Object
Scrapes the document at the given URL, retrying up to five attempts on error, and saves the result.
-
#get_next_page(offset) ⇒ Object
Posts a request for the next page of JIS search results starting at the given offset, retrying up to five attempts on error.
- #index ⇒ Object
-
#index_v2 ⇒ Object
Pubid-based index built in parallel with the legacy string index.
- #initial_post ⇒ Object
-
#initialize(output, format) ⇒ DataFetcher
constructor
A new instance of DataFetcher.
- #log_error(msg) ⇒ Object
-
#parse_offset(resp) ⇒ Object
Extracts the next result offset from a results page; on the first page it also records the total result count.
- #parse_page(resp) ⇒ Object
-
#pubid(id) ⇒ Object
Parse a primary docidentifier string into a pubid identifier; nil (with a warning) if pubid can’t parse it, so a single bad id never aborts the crawl or corrupts index-v2.
-
#save_doc(bib, url) ⇒ Object
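Writes the serialized document to its output file and adds it to both indexes; warns instead if that file was already written during this run.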
- #to_bibxml(bib) ⇒ Object
- #to_xml(bib) ⇒ Object
- #to_yaml(bib) ⇒ Object
Constructor Details
#initialize(output, format) ⇒ DataFetcher
Returns a new instance of DataFetcher.
11 12 13 14 15 16 |
# File 'lib/relaton/jis/data_fetcher.rb', line 11
#
# Sets up the crawl state: a bounded work queue of 10 slots, a pool of
# 5 worker threads, and a mutex that serializes document saving.
#
# @param output forwarded unchanged to the superclass constructor
# @param format forwarded unchanged to the superclass constructor
def initialize(output, format)
  super
  @queue = SizedQueue.new(10)
  # The mutex is independent of the pool; it is created first so it exists
  # before any worker thread is started.
  @mutex = Mutex.new
  @threads = create_thread_pool(5)
end
Instance Method Details
#agent ⇒ Object
107 108 109 |
# File 'lib/relaton/jis/data_fetcher.rb', line 107
#
# Returns the Mechanize agent used for all HTTP requests, creating it on
# first use and reusing the same instance afterwards.
#
# @return [Mechanize]
def agent
  return @agent if @agent

  @agent = Mechanize.new
end
#count ⇒ Object
140 141 142 |
# File 'lib/relaton/jis/data_fetcher.rb', line 140
#
# Total number of search results as an Integer. The raw value is stored in
# @count (set by #parse_offset); an unset value converts to 0.
#
# @return [Integer]
def count
  raw_total = @count
  raw_total.to_i
end
#create_thread_pool(size) ⇒ Object
58 59 60 61 62 63 64 65 66 |
# File 'lib/relaton/jis/data_fetcher.rb', line 58
#
# Starts `size` worker threads. Each worker repeatedly takes an item from
# @queue and passes it to #fetch_doc, and stops as soon as it takes the
# :END sentinel.
#
# @param size [Integer] number of worker threads to start
# @return [Array<Thread>] the started threads
def create_thread_pool(size)
  Array.new(size) do
    Thread.new do
      while (job = @queue.shift) != :END
        fetch_doc(job)
      end
    end
  end
end
#end_threads_and_wait ⇒ Object
134 135 136 137 138 |
# File 'lib/relaton/jis/data_fetcher.rb', line 134
#
# Shuts the worker pool down: enqueues one :END sentinel per worker, closes
# the queue, and blocks until every worker thread has finished.
#
# @return [Array<Thread>] the (now finished) worker threads
def end_threads_and_wait
  @threads.each { |_worker| @queue.push(:END) }
  @queue.close
  @threads.each { |worker| worker.join }
end
#fetch(_source = nil) ⇒ Object
86 87 88 89 90 91 92 93 94 |
# File 'lib/relaton/jis/data_fetcher.rb', line 86
#
# Runs the whole crawl: performs the initial search POST, loads the first
# results page, walks all pages, then saves both indexes and reports errors.
# Does nothing when #initial_post returns a falsy value.
#
# @param _source unused
# @return the result of `report_errors`, or nil when the initial POST failed
def fetch(_source = nil)
  if initial_post
    first_page = agent.get("#{URL}W11M0070/index")
    parse_page(first_page)
    index.save
    index_v2.save
    report_errors
  end
end
#fetch_doc(url) ⇒ Object
Scrapes the document at the given URL, retrying up to five attempts on error, and saves the result.
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/relaton/jis/data_fetcher.rb', line 68
#
# Scrapes a single document and hands it to #save_doc.
#
# Any StandardError raised while scraping is retried after a 2 second pause,
# for at most 5 attempts in total. After the last failed attempt the URL and
# the error (message plus the first 7 backtrace lines) are logged as
# warnings and the document is skipped; the error is not re-raised.
#
# @param url [String] document page URL taken from the results list
# @return [Object] result of #save_doc on success, otherwise the value
#   returned by Util.warn
def fetch_doc(url) # rubocop:disable Metrics/MethodLength
  attempts = 0
  begin
    bib = Scraper.new(url, @errors).fetch
  rescue StandardError => e
    attempts += 1
    if attempts < 5
      sleep 2
      retry
    else
      Util.warn "URL: #{url}"
      # Array() guards against exceptions that carry no backtrace (nil).
      Util.warn "#{e.message}\n#{Array(e.backtrace).first(7).join("\n")}"
    end
  else
    # Only reached when scraping succeeded; errors from saving are not retried.
    save_doc bib, url
  end
end
#get_next_page(offset) ⇒ Object
Posts a request for the next page of JIS search results starting at the given offset, retrying up to five attempts on error.
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# File 'lib/relaton/jis/data_fetcher.rb', line 144
#
# Requests the next page of search results.
#
# #initial_post is called first on every attempt; when it returns a falsy
# value no request is made. Any StandardError is retried after a 2 second
# pause, for at most 5 attempts in total. After the last failed attempt the
# error (message plus the first 7 backtrace lines) is logged as a warning.
#
# @param offset [Integer] offset of the first result of the requested page
# @return the response of `agent.post`, or nil when #initial_post was falsy
#   or all attempts failed
def get_next_page(offset) # rubocop:disable Metrics/MethodLength
  attempts = 0
  begin
    if initial_post
      url = "#{URL}W11M0070/getAddList"
      agent.post url, search_type: "JIS", offset: offset
    end
  rescue StandardError => e
    attempts += 1
    if attempts < 5
      sleep 2
      retry
    else
      # Array() guards against exceptions that carry no backtrace (nil).
      Util.warn "#{e.message}\n#{Array(e.backtrace).first(7).join("\n")}"
      # Return nil explicitly so the caller never mistakes the logger's
      # return value for a page.
      nil
    end
  end
end
#index ⇒ Object
22 23 24 |
# File 'lib/relaton/jis/data_fetcher.rb', line 22
#
# Returns the :jis index backed by "#{INDEXFILE}.yaml", looking it up or
# creating it on first use and reusing the same object afterwards.
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:jis, file: "#{INDEXFILE}.yaml")
end
#index_v2 ⇒ Object
Pubid-based index built in parallel with the legacy string index. The pool keys by type, so requesting a second :jis index with a different file evicts the v1 Type from the pool, but we keep our own reference in @index, so both stay live for the duration of the crawl.
30 31 32 33 34 |
# File 'lib/relaton/jis/data_fetcher.rb', line 30
#
# Returns the pubid-based :jis index backed by "#{INDEXFILE_V2}.yaml",
# looking it up or creating it on first use and reusing the same object
# afterwards. It is requested with ::Pubid::Jis::Identifier as pubid class.
def index_v2
  return @index_v2 if @index_v2

  index_file = "#{INDEXFILE_V2}.yaml"
  @index_v2 = Relaton::Index.find_or_create(
    :jis, file: index_file, pubid_class: ::Pubid::Jis::Identifier
  )
end
#initial_post ⇒ Object
96 97 98 99 100 101 102 103 104 105 |
# File 'lib/relaton/jis/data_fetcher.rb', line 96
#
# Submits the JIS search form unless that was already done less than 600
# seconds ago. The time of the POST is remembered in @initial_time.
#
# @return true when a recent POST is still within the 600 second window;
#   otherwise the "status" value of the JSON reply when truthy, else the
#   result of Util.warn
def initial_post
  recently_posted = @initial_time && Time.now - @initial_time < 600
  return true if recently_posted

  form = { record: 0, dantai: "JIS", searchtype2: 1, status_1: 1, status_2: 2 } # rubocop:disable Naming/VariableNumber
  reply = agent.post("#{URL}W11M0270/index", form)
  parsed = JSON.parse(reply.body)
  @initial_time = Time.now
  status = parsed["status"]
  return status if status

  Util.warn("No results found for JIS")
end
#log_error(msg) ⇒ Object
18 19 20 |
# File 'lib/relaton/jis/data_fetcher.rb', line 18
#
# Logs the message through Util.error.
#
# @param msg message to log
def log_error(msg)
  Util.error(msg)
end
#parse_offset(resp) ⇒ Object
Extracts the next result offset from a results page; on the first page it also records the total result count.
123 124 125 126 127 128 129 130 131 132 |
# File 'lib/relaton/jis/data_fetcher.rb', line 123
#
# Reads the next result offset from a results page.
#
# A page containing an element with id "btnPaging" is treated as the first
# page: the total is read from the script containing "var count =" and
# stored (as a string) in @count, and the offset comes from the value of the
# element with id "offset". On any other page the offset is read from the
# assignment to ("offset").value in the first script element.
#
# @param resp page object answering `at(xpath)`
# @return [Integer] the offset
def parse_offset(resp) # rubocop:disable Metrics/AbcSize
  unless resp.at('//*[@id="btnPaging"]')
    js = resp.at("//script").text
    return js.match(/\("offset"\)\.value = '(\d+)'/)[1].to_i
  end

  # first page
  count_script = resp.at('//script[contains(.,"var count =")]').text
  @count = count_script.match(/var count = (\d+);/)[1]
  resp.at("//*[@id='offset']")[:value].to_i
end
#parse_page(resp) ⇒ Object
111 112 113 114 115 116 117 118 119 120 121 |
# File 'lib/relaton/jis/data_fetcher.rb', line 111
#
# Walks the result pages starting from `resp`. For each page it enqueues the
# href of every link under div.blockGenaral, then reads the next offset.
# The walk stops when the offset reaches the total count or when no next
# page is returned; finally the worker threads are told to finish and joined.
#
# @param resp first results page (may be nil/false, in which case only the
#   worker shutdown runs)
def parse_page(resp)
  page = resp
  while page
    page.xpath('//div[@class="blockGenaral"]/a').each do |link|
      @queue.push(link[:href])
    end
    next_offset = parse_offset(page)
    page = next_offset >= count ? nil : get_next_page(next_offset)
  end
  end_threads_and_wait
end
#pubid(id) ⇒ Object
Parse a primary docidentifier string into a pubid identifier; nil (with a warning) if pubid can’t parse it, so a single bad id never aborts the crawl or corrupts index-v2.
39 40 41 42 43 44 |
# File 'lib/relaton/jis/data_fetcher.rb', line 39
#
# Parses a primary docidentifier string into a pubid identifier.
#
# Any StandardError raised by the parser is logged as a warning that
# includes the offending id and the error message, and nil is returned
# instead of raising.
#
# @param id [String] primary document identifier
# @return the result of ::Pubid::Jis::Identifier.parse, or nil when parsing
#   failed
def pubid(id)
  ::Pubid::Jis::Identifier.parse id
rescue StandardError => e
  Util.warn "Failed to parse `#{id}` with pubid: #{e.message}"
  nil
end
#save_doc(bib, url) ⇒ Object
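Writes the serialized document to its output file and adds it to both indexes; warns instead if that file was already written during this run.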
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
# File 'lib/relaton/jis/data_fetcher.rb', line 162
#
# Persists one scraped document. Does nothing when `bib` is nil/false.
#
# The output file name is derived from the primary docidentifier. Inside the
# mutex: if that file was already recorded in @files a duplicate warning is
# logged; otherwise the file is recorded and written as UTF-8, added to the
# string index, and added to the pubid index when the id can be parsed.
#
# @param bib scraped bibliographic item (or nil)
# @param url [String] page the item was scraped from, used in the warning
def save_doc(bib, url) # rubocop:disable Metrics/MethodLength
  return unless bib

  primary = bib.docidentifier.find { |docid| docid.primary }
  id = primary.content
  path = output_file(id)
  @mutex.synchronize do
    next Util.warn("File #{path} already exists. Duplication URL: #{url}") if @files.include?(path)

    @files << path
    File.write(path, serialize(bib), encoding: "UTF-8")
    index.add_or_update(id, path)
    parsed_id = pubid(id)
    index_v2.add_or_update(parsed_id, path) if parsed_id
  end
end
#to_bibxml(bib) ⇒ Object
54 55 56 |
# File 'lib/relaton/jis/data_fetcher.rb', line 54
#
# Serializes the item for the bibxml output format by calling its
# `to_rfcxml` method.
#
# @param bib bibliographic item
# @return the result of `bib.to_rfcxml`
def to_bibxml(bib)
  rfcxml = bib.to_rfcxml
  rfcxml
end
#to_xml(bib) ⇒ Object
50 51 52 |
# File 'lib/relaton/jis/data_fetcher.rb', line 50
#
# Serializes the item to XML via Bibdata.to_xml.
#
# @param bib bibliographic item
# @return the result of `Bibdata.to_xml`
def to_xml(bib)
  Bibdata.to_xml(bib)
end
#to_yaml(bib) ⇒ Object
46 47 48 |
# File 'lib/relaton/jis/data_fetcher.rb', line 46
#
# Serializes the item to YAML via Item.to_yaml.
#
# @param bib bibliographic item
# @return the result of `Item.to_yaml`
def to_yaml(bib)
  Item.to_yaml(bib)
end