Class: Relaton::Jis::DataFetcher

Inherits:
Core::DataFetcher
  • Object
show all
Defined in:
lib/relaton/jis/data_fetcher.rb

Constant Summary collapse

URL =
"https://webdesk.jsa.or.jp/books/"

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Returns a new instance of DataFetcher.



11
12
13
14
15
16
# File 'lib/relaton/jis/data_fetcher.rb', line 11

# Builds the fetcher and its concurrency primitives.
#
# Both arguments are handed unchanged to the parent initializer. Afterwards
# the bounded URL queue, the worker threads that drain it, and the mutex
# used by #save_doc are created.
#
# @param output [Object] forwarded to the superclass initializer
# @param format [Object] forwarded to the superclass initializer
def initialize(output, format)
  super(output, format)
  # Bounded queue: pushing blocks once 10 entries are waiting.
  @queue = SizedQueue.new(10)
  @threads = create_thread_pool(5)
  @mutex = Mutex.new
end

Instance Method Details

#agent ⇒ Object



89
90
91
# File 'lib/relaton/jis/data_fetcher.rb', line 89

# Returns the Mechanize client, creating it on first use and reusing the
# same instance on every later call.
#
# @return [Mechanize] the memoized client
def agent
  return @agent if @agent

  @agent = Mechanize.new
end

#count ⇒ Object



122
123
124
# File 'lib/relaton/jis/data_fetcher.rb', line 122

# Total number of documents as an Integer.
#
# The raw value in @count is stored by #parse_offset as the digits captured
# from the first result page; it is nil before that page has been parsed,
# in which case this returns 0.
#
# @return [Integer]
def count
  raw_total = @count
  raw_total.to_i
end

#create_thread_pool(size) ⇒ Object



42
43
44
45
46
47
48
49
50
# File 'lib/relaton/jis/data_fetcher.rb', line 42

# Starts +size+ worker threads that consume entries from @queue.
#
# Each worker keeps taking entries and passing them to #fetch_doc until it
# receives the :END sentinel, at which point it finishes.
#
# @param size [Integer] number of worker threads to start
# @return [Array<Thread>] the started threads
def create_thread_pool(size)
  worker = lambda do
    while (job = @queue.shift) != :END
      fetch_doc job
    end
  end

  Array.new(size) { Thread.new(&worker) }
end

#end_threads_and_wait ⇒ Object



116
117
118
119
120
# File 'lib/relaton/jis/data_fetcher.rb', line 116

# Shuts the worker pool down and blocks until every worker has finished.
#
# One :END sentinel is queued per thread, the queue is closed so nothing
# else can be pushed, and then each thread is joined.
#
# @return [Array<Thread>] the joined threads
def end_threads_and_wait
  @threads.each { @queue.push :END }
  @queue.close
  @threads.each(&:join)
end

#fetch(_source = nil) ⇒ Object



70
71
72
73
74
75
76
# File 'lib/relaton/jis/data_fetcher.rb', line 70

# Entry point: downloads the first result page, walks all pages and saves
# the index.
#
# Nothing is fetched when #initial_post returns a falsy value.
#
# @param _source [Object, nil] ignored
# @return [Object, nil] result of `index.save`, or nil when #initial_post
#   was falsy
def fetch(_source = nil)
  if initial_post
    first_page = agent.get("#{URL}W11M0070/index")
    parse_page(first_page)
    index.save
  end
end

#fetch_doc(url) ⇒ Object

rubocop:disable Metrics/MethodLength



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/relaton/jis/data_fetcher.rb', line 52

# Scrapes one document and hands the result to #save_doc.
#
# A StandardError raised while scraping triggers a retry after a 2 second
# pause, for up to five attempts in total. After the fifth failure the URL
# and the error (message plus the first seven backtrace lines) are logged as
# warnings and the document is skipped. #save_doc runs outside the rescued
# section, so its errors are neither retried nor caught here.
#
# @param url [String] document URL passed to Scraper
# @return [Object] result of #save_doc, or of the last Util.warn on failure
def fetch_doc(url) # rubocop:disable Metrics/MethodLength
  failures = 0
  begin
    doc = Scraper.new(url).fetch
  rescue StandardError => e
    if (failures += 1) < 5
      sleep 2
      retry
    end
    Util.warn "URL: #{url}"
    Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
  else
    save_doc doc, url
  end
end

#get_next_page(offset) ⇒ Object

rubocop:disable Metrics/MethodLength



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/relaton/jis/data_fetcher.rb', line 126

# Requests the next batch of search results starting at +offset+.
#
# The request is only sent when #initial_post reports success. Any
# StandardError is retried after a 2 second pause, for up to five attempts
# in total; after the fifth failure the error is logged as a warning.
#
# The method returns nil explicitly when it gives up. #parse_page loops on
# the truthiness of this return value, so the logger's own return value must
# not leak out and be mistaken for a page.
#
# @param offset [Integer] value sent as the `offset` form field
# @return [Object, nil] the response of `agent.post`, or nil when
#   #initial_post was falsy or all attempts failed
def get_next_page(offset) # rubocop:disable Metrics/MethodLength
  attempts = 0
  begin
    return unless initial_post

    agent.post "#{URL}W11M0070/getAddList", search_type: "JIS", offset: offset
  rescue StandardError => e
    attempts += 1
    if attempts < 5
      sleep 2
      retry
    end
    Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
    nil
  end
end

#gh_issue_channel ⇒ Object



18
19
20
# File 'lib/relaton/jis/data_fetcher.rb', line 18

# Repository name and issue title for this fetcher.
# NOTE(review): presumably consumed by the parent class when reporting
# errors as GitHub issues — confirm against Core::DataFetcher.
#
# @return [Array<String>] two elements: repository, then issue title
def gh_issue_channel
  repository = "relaton/relaton-jis"
  issue_title = "Error fetching JIS documents"
  [repository, issue_title]
end

#index ⇒ Object



26
27
28
# File 'lib/relaton/jis/data_fetcher.rb', line 26

# Returns the JIS index, looking it up (or creating it) on first use and
# reusing the same object afterwards. The index file name is INDEXFILE with
# a ".yaml" extension.
#
# @return [Object] the memoized result of Relaton::Index.find_or_create
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:jis, file: "#{INDEXFILE}.yaml")
end

#initial_post ⇒ Object



78
79
80
81
82
83
84
85
86
87
# File 'lib/relaton/jis/data_fetcher.rb', line 78

# Posts the JIS search form and reports whether the response has a status.
# NOTE(review): presumably this establishes the server-side search session
# that the paging requests rely on — confirm.
#
# A successful call is remembered for 600 seconds; within that window the
# method returns true without sending another request. The timestamp is
# recorded only on success, so a failed attempt is never cached as a
# success. On failure the method returns false explicitly rather than the
# logger's return value, so callers testing truthiness get a reliable
# answer.
#
# @return [Object, false] the truthy "status" value of the JSON response
#   (or true when served from the 600 second window); false when the
#   response has no truthy "status"
def initial_post
  return true if @initial_time && Time.now - @initial_time < 600

  body = { record: 0, dantai: "JIS", searchtype2: 1,
           status_1: 1, status_2: 2 } # rubocop:disable Naming/VariableNumber
  resp = agent.post "#{URL}W11M0270/index", body
  status = JSON.parse(resp.body)["status"]
  unless status
    Util.warn "No results found for JIS"
    return false
  end

  @initial_time = Time.now
  status
end

#log_error(msg) ⇒ Object



22
23
24
# File 'lib/relaton/jis/data_fetcher.rb', line 22

# Forwards an error message to Util.error.
#
# @param msg [String] message to log
# @return [Object] whatever Util.error returns
def log_error(msg)
  Util.error(msg)
end

#parse_offset(resp) ⇒ Object

rubocop:disable Metrics/AbcSize



105
106
107
108
109
110
111
112
113
114
# File 'lib/relaton/jis/data_fetcher.rb', line 105

# Extracts the paging offset from a result page.
#
# Two page layouts are handled:
# * a page with an element whose id is "btnPaging" (the first page): the
#   total from the `var count = N;` script is stored in @count (as the
#   matched string) and the offset is read from the `value` attribute of
#   the element with id "offset";
# * any other page: the offset is taken from the first <script> element,
#   which assigns it via `("offset").value = 'N'`.
#
# @param resp [Object] page object responding to `at`
# @return [Integer] the offset found on the page
def parse_offset(resp) # rubocop:disable Metrics/AbcSize
  unless resp.at('//*[@id="btnPaging"]')
    # Follow-up page: the offset only appears inside the script.
    js = resp.at("//script").text
    return js.match(/\("offset"\)\.value = '(\d+)'/)[1].to_i
  end

  counter_script = resp.at('//script[contains(.,"var count =")]')
  @count = counter_script.text.match(/var count = (\d+);/)[1]
  resp.at("//*[@id='offset']")[:value].to_i
end

#parse_page(resp) ⇒ Object



93
94
95
96
97
98
99
100
101
102
103
# File 'lib/relaton/jis/data_fetcher.rb', line 93

# Walks the result pages, queueing every document link for the workers.
#
# For each page the href of every matching link is pushed onto @queue, then
# the page's offset is read. The walk stops when the offset reaches #count
# or when #get_next_page returns a falsy value. Finally the worker threads
# are told to finish and are awaited.
#
# @param resp [Object, nil] first page; a falsy value skips straight to
#   shutting the workers down
# @return [Object] result of #end_threads_and_wait
def parse_page(resp)
  page = resp
  while page
    links = page.xpath('//div[@class="blockGenaral"]/a')
    links.each { |link| @queue << link[:href] }
    next_offset = parse_offset(page)
    break unless next_offset < count

    page = get_next_page(next_offset)
  end
  end_threads_and_wait
end

#save_doc(bib, url) ⇒ Object

rubocop:disable Metrics/MethodLength



144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/relaton/jis/data_fetcher.rb', line 144

# Writes a scraped document to disk and registers it in the index.
#
# The file name is derived from the content of the document's primary
# identifier. The duplicate check, the file write and the index update all
# run while holding @mutex, so concurrent workers cannot interleave them. If
# the target file was already written during this run, a warning naming the
# duplicate URL is logged and nothing is written.
#
# @param bib [Object, nil] scraped document; a falsy value is ignored
# @param url [String] source URL, used only in the duplicate warning
# @return [Object, nil] nil for a falsy +bib+, otherwise the result of
#   Util.warn (duplicate) or `index.add_or_update`
def save_doc(bib, url) # rubocop:disable Metrics/MethodLength
  return unless bib

  primary_id = bib.docidentifier.find(&:primary).content
  path = output_file(primary_id)
  @mutex.synchronize do
    if @files.include?(path)
      next Util.warn("File #{path} already exists. Duplication URL: #{url}")
    end

    @files << path
    File.write(path, serialize(bib), encoding: "UTF-8")
    index.add_or_update(primary_id, path)
  end
end

#to_bibxml(bib) ⇒ Object



38
39
40
# File 'lib/relaton/jis/data_fetcher.rb', line 38

# Serializes a document by calling its `to_rfcxml` method.
#
# @param bib [Object] document responding to `to_rfcxml`
# @return [Object] whatever `bib.to_rfcxml` returns
def to_bibxml(bib)
  bib.to_rfcxml()
end

#to_xml(bib) ⇒ Object



34
35
36
# File 'lib/relaton/jis/data_fetcher.rb', line 34

# Serializes a document through Bibdata.to_xml.
#
# @param bib [Object] document to serialize
# @return [Object] whatever Bibdata.to_xml returns
def to_xml(bib)
  Bibdata.to_xml(bib)
end

#to_yaml(bib) ⇒ Object



30
31
32
# File 'lib/relaton/jis/data_fetcher.rb', line 30

# Serializes a document through Item.to_yaml.
#
# @param bib [Object] document to serialize
# @return [Object] whatever Item.to_yaml returns
def to_yaml(bib)
  Item.to_yaml(bib)
end