Class: Relaton::Jis::DataFetcher
- Inherits:
-
Core::DataFetcher
- Object
- Core::DataFetcher
- Relaton::Jis::DataFetcher
- Defined in:
- lib/relaton/jis/data_fetcher.rb
Constant Summary collapse
- URL =
"https://webdesk.jsa.or.jp/books/"
Instance Method Summary collapse
- #agent ⇒ Object
- #count ⇒ Object
- #create_thread_pool(size) ⇒ Object
- #end_threads_and_wait ⇒ Object
- #fetch(_source = nil) ⇒ Object
-
#fetch_doc(url) ⇒ Object
Scrapes the document at the given URL (up to five attempts) and saves it; logs a warning if every attempt fails.
-
#get_next_page(offset) ⇒ Object
Posts a "getAddList" request for the given offset (up to five attempts) and returns the response.
- #index ⇒ Object
- #initial_post ⇒ Object
-
#initialize(output, format) ⇒ DataFetcher
constructor
A new instance of DataFetcher.
- #log_error(msg) ⇒ Object
-
#parse_offset(resp) ⇒ Object
Extracts the paging offset from a response; on the first page it also records the total count.
- #parse_page(resp) ⇒ Object
-
#save_doc(bib, url) ⇒ Object
Writes the document to its output file and adds it to the index, warning when the file was already written.
- #to_bibxml(bib) ⇒ Object
- #to_xml(bib) ⇒ Object
- #to_yaml(bib) ⇒ Object
Constructor Details
#initialize(output, format) ⇒ DataFetcher
Returns a new instance of DataFetcher.
11 12 13 14 15 16 |
# File 'lib/relaton/jis/data_fetcher.rb', line 11

# Builds the fetcher's concurrency primitives after delegating to the
# superclass constructor.
#
# @param output [Object] passed through to the superclass unchanged
# @param format [Object] passed through to the superclass unchanged
def initialize(output, format)
  super
  # Guards the file list and index updates performed in #save_doc.
  @mutex = Mutex.new
  # Bounded queue: producers block once 10 URLs are waiting.
  @queue = SizedQueue.new(10)
  # Workers start immediately and block on the (still empty) queue.
  @threads = create_thread_pool(5)
end
Instance Method Details
#agent ⇒ Object
86 87 88 |
# File 'lib/relaton/jis/data_fetcher.rb', line 86

# Lazily creates and memoizes the Mechanize instance used for all requests.
#
# @return [Mechanize] the same instance on every call
def agent
  return @agent if @agent

  @agent = Mechanize.new
end
#count ⇒ Object
119 120 121 |
# File 'lib/relaton/jis/data_fetcher.rb', line 119

# Integer form of the stored @count (assigned in #parse_offset from a
# regexp capture, i.e. a String).
#
# @return [Integer] 0 while @count has not been assigned yet
def count
  raw = @count
  raw.to_i
end
#create_thread_pool(size) ⇒ Object
38 39 40 41 42 43 44 45 46 |
# File 'lib/relaton/jis/data_fetcher.rb', line 38

# Starts +size+ worker threads. Each worker pulls entries from @queue and
# hands them to #fetch_doc until it receives the :END sentinel.
#
# @param size [Integer] number of worker threads to start
# @return [Array<Thread>] the started threads
def create_thread_pool(size)
  Array.new(size) do
    Thread.new do
      while (job = @queue.shift) != :END
        fetch_doc job
      end
    end
  end
end
#end_threads_and_wait ⇒ Object
113 114 115 116 117 |
# File 'lib/relaton/jis/data_fetcher.rb', line 113

# Shuts the worker pool down: enqueues one :END sentinel per worker,
# closes the queue, then blocks until every worker thread has finished.
#
# @return [Array<Thread>] the (now finished) worker threads
def end_threads_and_wait
  @threads.each { @queue << :END }
  @queue.close
  @threads.each(&:join)
end
#fetch(_source = nil) ⇒ Object
66 67 68 69 70 71 72 73 |
# File 'lib/relaton/jis/data_fetcher.rb', line 66

# Entry point: performs the initial search post, loads the first listing
# page, processes all pages, then saves the index and reports errors.
# Does nothing when #initial_post returns a falsy value.
#
# @param _source [Object, nil] unused
def fetch(_source = nil)
  if initial_post
    first_page = agent.get("#{URL}W11M0070/index")
    parse_page(first_page)
    index.save
    report_errors
  end
end
#fetch_doc(url) ⇒ Object
Scrapes the document at the given URL (up to five attempts) and saves it; logs a warning if every attempt fails.
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/relaton/jis/data_fetcher.rb', line 48

# Scrapes the document at +url+ and passes the result to #save_doc.
# A failing scrape is retried (sleeping 2 seconds in between) until five
# attempts have been made; after that the URL, the error message and the
# first backtrace lines are logged via Util.warn.
#
# @param url [String] document URL taken from the work queue
def fetch_doc(url) # rubocop:disable Metrics/MethodLength
  attempts = 0
  begin
    bib = Scraper.new(url, @errors).fetch
  rescue StandardError => e
    attempts += 1
    if attempts < 5
      sleep 2
      retry
    end
    Util.warn "URL: #{url}"
    # Array() keeps the log call safe if the exception carries no backtrace.
    Util.warn "#{e.message}\n#{Array(e.backtrace).first(7).join("\n")}"
  else
    # Runs only when the scrape succeeded; errors raised here are not retried.
    save_doc bib, url
  end
end
#get_next_page(offset) ⇒ Object
Posts a "getAddList" request for the given offset (up to five attempts) and returns the response.
123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# File 'lib/relaton/jis/data_fetcher.rb', line 123

# Posts to the "getAddList" endpoint for the given offset, after making
# sure the initial search post is in place (see #initial_post).
# A failing request is retried (sleeping 2 seconds in between) until five
# attempts have been made; after that the error is logged via Util.warn.
#
# @param offset [Integer] offset value sent with the request
# @return [Object, nil] the response of agent.post, or nil when
#   #initial_post is falsy or every attempt failed
def get_next_page(offset) # rubocop:disable Metrics/MethodLength
  attempts = 0
  begin
    if initial_post
      url = "#{URL}W11M0070/getAddList"
      agent.post url, search_type: "JIS", offset: offset
    end
  rescue StandardError => e
    attempts += 1
    if attempts < 5
      sleep 2
      retry
    end
    Util.warn "#{e.message}\n#{Array(e.backtrace).first(7).join("\n")}"
    # Explicit nil so a caller looping on the response never mistakes the
    # logger's return value for a page.
    nil
  end
end
#index ⇒ Object
22 23 24 |
# File 'lib/relaton/jis/data_fetcher.rb', line 22

# Lazily obtains and memoizes the :jis index, backed by the file
# "#{INDEXFILE}.yaml".
#
# @return [Object] whatever Relaton::Index.find_or_create returns
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:jis, file: "#{INDEXFILE}.yaml")
end
#initial_post ⇒ Object
75 76 77 78 79 80 81 82 83 84 |
# File 'lib/relaton/jis/data_fetcher.rb', line 75

# Sends the initial search post and remembers when it last succeeded.
# While a successful post is less than 600 seconds old, returns true
# without contacting the server again.
#
# The timestamp is stored only when the response reports a truthy
# "status"; a failed search therefore never makes later calls return true
# from the cache.
#
# @return [Object] true (cached), the truthy "status" value of the JSON
#   response, or the result of Util.warn when "status" is falsy
def initial_post
  return true if @initial_time && Time.now - @initial_time < 600

  body = { record: 0, dantai: "JIS", searchtype2: 1, status_1: 1, status_2: 2 } # rubocop:disable Naming/VariableNumber
  resp = agent.post "#{URL}W11M0270/index", body
  status = JSON.parse(resp.body)["status"]
  return Util.warn("No results found for JIS") unless status

  @initial_time = Time.now
  status
end
#log_error(msg) ⇒ Object
18 19 20 |
# File 'lib/relaton/jis/data_fetcher.rb', line 18

# Forwards an error message to Util.error.
#
# @param msg [String] message to log
# @return [Object] whatever Util.error returns
def log_error(msg)
  Util.error(msg)
end
#parse_offset(resp) ⇒ Object
Extracts the paging offset from a response; on the first page it also records the total count.
102 103 104 105 106 107 108 109 110 111 |
# File 'lib/relaton/jis/data_fetcher.rb', line 102

# Reads the paging offset out of a response.
#
# When the response contains an element with id "btnPaging" (first page),
# the "var count = N;" script value is stored in @count and the offset is
# taken from the "value" attribute of the element with id "offset".
# Otherwise the offset is parsed from the text of the first <script>.
#
# @param resp [Object] page object responding to #at (XPath lookup)
# @return [Integer] the offset found in the response
def parse_offset(resp) # rubocop:disable Metrics/AbcSize
  unless resp.at('//*[@id="btnPaging"]')
    script_text = resp.at("//script").text
    return script_text.match(/\("offset"\)\.value = '(\d+)'/)[1].to_i
  end

  # first page
  counter_script = resp.at('//script[contains(.,"var count =")]')
  @count = counter_script.text.match(/var count = (\d+);/)[1]
  resp.at("//*[@id='offset']")[:value].to_i
end
#parse_page(resp) ⇒ Object
90 91 92 93 94 95 96 97 98 99 100 |
# File 'lib/relaton/jis/data_fetcher.rb', line 90

# Walks the listing pages starting at +resp+: every document link found on
# a page is pushed onto @queue for the workers, then the next page is
# requested until the offset reaches #count or no response is returned.
# Afterwards the worker pool is shut down and joined.
#
# If paging raises a StandardError the workers are still shut down before
# the error is re-raised, so no thread stays blocked on the queue.
#
# @param resp [Object, nil] first listing page (responds to #xpath)
# @return [Object] the result of #end_threads_and_wait
# @raise [StandardError] any error raised while paging
def parse_page(resp)
  begin
    while resp
      xpath = '//div[@class="blockGenaral"]/a'
      resp.xpath(xpath).each { |a| @queue << a[:href] }
      offset = parse_offset resp
      break if offset >= count

      resp = get_next_page(offset)
    end
  rescue StandardError
    end_threads_and_wait
    raise
  end
  end_threads_and_wait
end
#save_doc(bib, url) ⇒ Object
Writes the document to its output file and adds it to the index, warning when the file was already written.
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/relaton/jis/data_fetcher.rb', line 141

# Serializes +bib+ to its output file and registers it in the index.
# The file name is derived from the content of the primary document
# identifier. If that file was already written during this run, nothing
# is written and a warning naming the duplicate URL is logged instead.
# All bookkeeping happens while holding @mutex.
#
# @param bib [Object, nil] scraped document; nil/false is ignored
# @param url [String] source URL, used only in the duplicate warning
def save_doc(bib, url) # rubocop:disable Metrics/MethodLength
  return unless bib

  primary_id = bib.docidentifier.find(&:primary).content
  path = output_file(primary_id)
  @mutex.synchronize do
    next Util.warn("File #{path} already exists. Duplication URL: #{url}") if @files.include?(path)

    @files << path
    File.write(path, serialize(bib), encoding: "UTF-8")
    index.add_or_update(primary_id, path)
  end
end
#to_bibxml(bib) ⇒ Object
34 35 36 |
# File 'lib/relaton/jis/data_fetcher.rb', line 34

# Serializes +bib+ by calling its #to_rfcxml method.
#
# @param bib [Object] document responding to #to_rfcxml
# @return [Object] the result of bib.to_rfcxml
def to_bibxml(bib)
  document = bib
  document.to_rfcxml
end
#to_xml(bib) ⇒ Object
30 31 32 |
# File 'lib/relaton/jis/data_fetcher.rb', line 30

# Serializes +bib+ through Bibdata.to_xml.
#
# @param bib [Object] document to serialize
# @return [Object] the result of Bibdata.to_xml
def to_xml(bib)
  Bibdata.to_xml(bib)
end
#to_yaml(bib) ⇒ Object
26 27 28 |
# File 'lib/relaton/jis/data_fetcher.rb', line 26

# Serializes +bib+ through Item.to_yaml.
#
# @param bib [Object] document to serialize
# @return [Object] the result of Item.to_yaml
def to_yaml(bib)
  Item.to_yaml(bib)
end