Class: Relaton::Jis::DataFetcher

Inherits:
Core::DataFetcher
  • Object
show all
Defined in:
lib/relaton/jis/data_fetcher.rb

Constant Summary collapse

# Base URL that every request path built in this class is appended to
# (for example "#{URL}W11M0070/index").
URL =
"https://webdesk.jsa.or.jp/books/"

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Returns a new instance of DataFetcher.



11
12
13
14
15
16
# File 'lib/relaton/jis/data_fetcher.rb', line 11

# Builds the fetcher and starts its background workers right away.
#
# Both arguments are handed unchanged to the superclass initializer.
#
# @param output passed through to the superclass
# @param format passed through to the superclass
def initialize(output, format)
  super(output, format)
  @mutex = Mutex.new
  # The queue has to exist before the pool is created: each worker starts
  # reading from it as soon as its thread is spawned.
  @queue = SizedQueue.new(10)
  @threads = create_thread_pool(5)
end

Instance Method Details

#agent ⇒ Object



86
87
88
# File 'lib/relaton/jis/data_fetcher.rb', line 86

# Returns the Mechanize client, creating it on first use and reusing the
# same instance afterwards.
#
# @return [Mechanize] the memoized client
def agent
  return @agent if @agent

  @agent = Mechanize.new
end

#count ⇒ Object



119
120
121
# File 'lib/relaton/jis/data_fetcher.rb', line 119

# Integer form of the value stored in @count.
#
# parse_offset stores the digits captured from the page's
# "var count = N;" script as a String; before that happens @count is nil
# and this returns 0.
#
# @return [Integer]
def count
  raw_total = @count
  raw_total.to_i
end

#create_thread_pool(size) ⇒ Object



38
39
40
41
42
43
44
45
46
# File 'lib/relaton/jis/data_fetcher.rb', line 38

# Starts +size+ worker threads that consume @queue.
#
# Every worker takes items off the queue and passes each one to fetch_doc
# until it receives the :END sentinel, at which point that worker stops.
#
# @param size [Integer] number of threads to start
# @return [Array<Thread>] the started threads
def create_thread_pool(size)
  worker = lambda do
    while (job = @queue.pop) != :END
      fetch_doc job
    end
  end

  Array.new(size) { Thread.new(&worker) }
end

#end_threads_and_wait ⇒ Object



113
114
115
116
117
# File 'lib/relaton/jis/data_fetcher.rb', line 113

# Shuts the worker pool down and blocks until every worker has finished.
#
# One :END sentinel is queued per thread, the queue is then closed so
# nothing more can be added, and finally each thread is joined.
#
# @return [Array<Thread>] the joined threads
def end_threads_and_wait
  @threads.each { @queue.push :END }
  @queue.close
  @threads.each(&:join)
end

#fetch(_source = nil) ⇒ Object



66
67
68
69
70
71
72
73
# File 'lib/relaton/jis/data_fetcher.rb', line 66

# Entry point of the fetcher.
#
# When initial_post succeeds, the page at "#{URL}W11M0070/index" is
# downloaded and handed to parse_page; afterwards the index is saved and
# report_errors is called. When initial_post fails nothing else happens.
#
# @param _source unused
# @return [Object, nil] result of report_errors, or nil when initial_post
#   returned a falsy value
def fetch(_source = nil)
  if initial_post
    first_page = agent.get("#{URL}W11M0070/index")
    parse_page(first_page)
    index.save
    report_errors
  end
end

#fetch_doc(url) ⇒ Object

rubocop:disable Metrics/MethodLength



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/relaton/jis/data_fetcher.rb', line 48

# Scrapes one document and stores it.
#
# The scrape is tried up to five times with a two-second pause between
# tries. After the fifth failure the URL and the first seven backtrace
# lines are logged as warnings and the document is skipped. Saving is done
# outside the retried section, so an error raised by save_doc is neither
# retried nor rescued here.
#
# @param url [String] document page URL taken from the queue
# @return [Object] result of save_doc, or of the last Util.warn on failure
def fetch_doc(url) # rubocop:disable Metrics/MethodLength
  tries = 0
  begin
    bib = Scraper.new(url, @errors).fetch
  rescue StandardError => e
    tries += 1
    if tries >= 5
      Util.warn "URL: #{url}"
      return Util.warn("#{e.message}\n#{e.backtrace.first(7).join("\n")}")
    end

    sleep 2
    retry
  end
  save_doc bib, url
end

#get_next_page(offset) ⇒ Object

rubocop:disable Metrics/MethodLength



123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/relaton/jis/data_fetcher.rb', line 123

# Requests the next batch of results, starting at +offset+.
#
# initial_post is called first; if it returns a falsy value no request is
# made. Otherwise "#{URL}W11M0070/getAddList" is posted with the search
# type "JIS" and the offset. Any StandardError triggers a retry after two
# seconds, up to five tries in total; after the fifth failure the error
# message and the first seven backtrace lines are logged as a warning.
#
# @param offset [Integer] offset sent with the request
# @return [Object, nil] the response, nil when initial_post was falsy, or
#   the result of Util.warn when all tries failed
def get_next_page(offset) # rubocop:disable Metrics/MethodLength
  tries = 0
  begin
    return unless initial_post

    agent.post("#{URL}W11M0070/getAddList", search_type: "JIS", offset: offset)
  rescue StandardError => e
    tries += 1
    if tries >= 5
      Util.warn "#{e.message}\n#{e.backtrace.first(7).join("\n")}"
    else
      sleep 2
      retry
    end
  end
end

#index ⇒ Object



22
23
24
# File 'lib/relaton/jis/data_fetcher.rb', line 22

# Returns the :jis index, looking it up or creating it on first use and
# reusing the same object afterwards. The index file name is INDEXFILE
# with a ".yaml" suffix.
#
# @return [Object] whatever Relaton::Index.find_or_create returned
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:jis, file: "#{INDEXFILE}.yaml")
end

#initial_post ⇒ Object



75
76
77
78
79
80
81
82
83
84
# File 'lib/relaton/jis/data_fetcher.rb', line 75

# Posts the fixed search form to "#{URL}W11M0270/index" unless a successful
# post happened less than 600 seconds ago.
#
# The reply body is parsed as JSON and its "status" entry decides success.
# The timestamp used for the 600-second shortcut is recorded only after a
# successful post, so a failed attempt is not mistaken for a valid one on
# the next call.
# NOTE(review): presumably this post sets up server-side search state that
# the later W11M0070 requests rely on - confirm against the site.
#
# @return [Object] true when the shortcut applies, the truthy "status"
#   value on success, otherwise the result of Util.warn
# @raise [JSON::ParserError] when the reply body is not valid JSON
def initial_post
  return true if @initial_time && Time.now - @initial_time < 600

  body = { record: 0, dantai: "JIS", searchtype2: 1,
           status_1: 1, status_2: 2 } # rubocop:disable Naming/VariableNumber
  resp = agent.post "#{URL}W11M0270/index", body
  status = JSON.parse(resp.body)["status"]
  return Util.warn("No results found for JIS") unless status

  # Stamp the time only on success; stamping unconditionally would make
  # the guard above report success for ten minutes after a failed post.
  @initial_time = Time.now
  status
end

#log_error(msg) ⇒ Object



18
19
20
# File 'lib/relaton/jis/data_fetcher.rb', line 18

# Forwards an error message to Util.error.
#
# @param msg [String] text to log
# @return [Object] whatever Util.error returns
def log_error(msg)
  Util.error(msg)
end

#parse_offset(resp) ⇒ Object

rubocop:disable Metrics/AbcSize



102
103
104
105
106
107
108
109
110
111
# File 'lib/relaton/jis/data_fetcher.rb', line 102

# Reads the next paging offset out of a response.
#
# A response containing an element with id "btnPaging" is treated as the
# first page: the number in its "var count = N;" script is stored in @count
# (as a String) and the offset comes from the "value" attribute of the
# element with id "offset". Any other response is expected to carry the
# offset inside its first <script>, in an assignment of the form
# ("offset").value = 'N'.
#
# @param resp [#at] parsed page answering XPath queries through #at
# @return [Integer] the offset found in the response
# @raise [NoMethodError] when an expected node or pattern is missing
def parse_offset(resp) # rubocop:disable Metrics/AbcSize
  unless resp.at('//*[@id="btnPaging"]')
    paging_js = resp.at("//script").text
    return paging_js.match(/\("offset"\)\.value = '(\d+)'/)[1].to_i
  end

  # first page
  count_js = resp.at('//script[contains(.,"var count =")]').text
  @count = count_js.match(/var count = (\d+);/)[1]
  resp.at("//*[@id='offset']")[:value].to_i
end

#parse_page(resp) ⇒ Object



90
91
92
93
94
95
96
97
98
99
100
# File 'lib/relaton/jis/data_fetcher.rb', line 90

# Walks the result pages, queueing every document link for the workers.
#
# For each page the href of every link under a div with class
# "blockGenaral" is pushed onto @queue. Paging continues with
# get_next_page until the parsed offset reaches count or no further page
# is returned. The worker pool is then shut down and waited for.
#
# @param resp [#xpath, nil] first result page
# @return [Object] result of end_threads_and_wait
def parse_page(resp)
  page = resp
  while page
    page.xpath('//div[@class="blockGenaral"]/a').each { |link| @queue.push link[:href] }
    next_offset = parse_offset page
    # Everything has been listed once the offset reaches the total count.
    page = next_offset < count ? get_next_page(next_offset) : nil
  end
  end_threads_and_wait
end

#save_doc(bib, url) ⇒ Object

rubocop:disable Metrics/MethodLength



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/relaton/jis/data_fetcher.rb', line 141

# Writes one document to disk and registers it in the index.
#
# The output file name is derived from the content of the document's
# primary identifier. The duplicate check, the write and the index update
# all run while holding @mutex. If the file was already written during
# this run, a warning naming the duplicate URL is logged and nothing is
# written.
#
# @param bib [Object, nil] scraped document; a falsy value is skipped
# @param url [String] page the document came from, used in the warning
# @return [Object, nil] nil for a falsy bib, otherwise the result of the
#   index update or of Util.warn
def save_doc(bib, url) # rubocop:disable Metrics/MethodLength
  return unless bib

  primary_id = bib.docidentifier.find { |docid| docid.primary }.content
  path = output_file primary_id
  @mutex.synchronize do
    next Util.warn("File #{path} already exists. Duplication URL: #{url}") if @files.include?(path)

    @files << path
    File.write path, serialize(bib), encoding: "UTF-8"
    index.add_or_update primary_id, path
  end
end

#to_bibxml(bib) ⇒ Object



34
35
36
# File 'lib/relaton/jis/data_fetcher.rb', line 34

# Serializes a document by calling its #to_rfcxml method.
#
# @param bib [#to_rfcxml] document to serialize
# @return [Object] whatever bib.to_rfcxml returns
def to_bibxml(bib)
  rfcxml = bib.to_rfcxml
  rfcxml
end

#to_xml(bib) ⇒ Object



30
31
32
# File 'lib/relaton/jis/data_fetcher.rb', line 30

# Serializes a document through Bibdata.to_xml.
#
# @param bib [Object] document to serialize
# @return [Object] whatever Bibdata.to_xml returns
def to_xml(bib)
  Bibdata.to_xml(bib)
end

#to_yaml(bib) ⇒ Object



26
27
28
# File 'lib/relaton/jis/data_fetcher.rb', line 26

# Serializes a document through Item.to_yaml.
#
# @param bib [Object] document to serialize
# @return [Object] whatever Item.to_yaml returns
def to_yaml(bib)
  Item.to_yaml(bib)
end