Class: Relaton::Jis::DataFetcher

Inherits:
Core::DataFetcher
  • Object
show all
Defined in:
lib/relaton/jis/data_fetcher.rb

Constant Summary collapse

URL =
"https://webdesk.jsa.or.jp/books/"

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Returns a new instance of DataFetcher.



11
12
13
14
15
16
# File 'lib/relaton/jis/data_fetcher.rb', line 11

# Builds the fetcher and the concurrency primitives used during a crawl.
#
# @param output forwarded unchanged to the superclass constructor
# @param format forwarded unchanged to the superclass constructor
def initialize(output, format)
  super
  # Lock taken by #save_doc around file bookkeeping and index updates.
  @mutex = Mutex.new
  # Bounded work queue: at most 10 pending entries before producers block.
  @queue = SizedQueue.new(10)
  # Five workers that start waiting on the queue immediately.
  @threads = create_thread_pool(5)
end

Instance Method Details

#agent ⇒ Object



107
108
109
# File 'lib/relaton/jis/data_fetcher.rb', line 107

# Memoized Mechanize client shared by every HTTP call made through this
# fetcher.
#
# @return [Mechanize] the same instance on every call
def agent
  return @agent if @agent

  @agent = Mechanize.new
end

#count ⇒ Object



140
141
142
# File 'lib/relaton/jis/data_fetcher.rb', line 140

# Integer view of the total captured by #parse_offset from the first
# listing page.
#
# @return [Integer] the stored total, or 0 when none has been captured yet
def count
  total = @count
  total.to_i
end

#create_thread_pool(size) ⇒ Object



58
59
60
61
62
63
64
65
66
# File 'lib/relaton/jis/data_fetcher.rb', line 58

# Starts +size+ worker threads that drain @queue.
#
# Each worker hands every queued entry to #fetch_doc and exits as soon as it
# pulls the :END sentinel, so one sentinel is needed per worker.
#
# @param size [Integer] number of worker threads to start
# @return [Array<Thread>] the started threads
def create_thread_pool(size)
  worker = proc do
    while (job = @queue.shift) != :END
      fetch_doc job
    end
  end
  Array.new(size) { Thread.new(&worker) }
end

#end_threads_and_wait ⇒ Object



134
135
136
137
138
# File 'lib/relaton/jis/data_fetcher.rb', line 134

# Shuts the worker pool down: enqueues one :END sentinel per worker, closes
# the queue to further pushes, then blocks until every worker has finished.
#
# @return [Array<Thread>] the joined worker threads
def end_threads_and_wait
  @threads.each { |_worker| @queue.push :END }
  @queue.close
  @threads.each { |worker| worker.join }
end

#fetch(_source = nil) ⇒ Object



86
87
88
89
90
91
92
93
94
# File 'lib/relaton/jis/data_fetcher.rb', line 86

# Runs the whole crawl: opens the search session, walks the listing pages,
# then persists both indexes and reports collected errors.
#
# @param _source unused
# @return [nil, Object] nil when the initial search post fails, otherwise
#   whatever #report_errors returns
def fetch(_source = nil)
  return unless initial_post

  listing_url = "#{URL}W11M0070/index"
  parse_page agent.get(listing_url)
  index.save
  index_v2.save
  report_errors
end

#fetch_doc(url) ⇒ Object

rubocop:disable Metrics/MethodLength



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/relaton/jis/data_fetcher.rb', line 68

# Scrapes one document page and saves the result.
#
# Scraping is attempted up to five times with a two-second pause between
# attempts; after the fifth failure the URL and the first backtrace lines
# are logged as warnings and the document is skipped. Errors raised while
# saving are not retried.
#
# @param url the document URL taken from a listing page
# @return [Object] the result of #save_doc, or of the last warning call
def fetch_doc(url) # rubocop:disable Metrics/MethodLength
  failures = 0
  begin
    bib = Scraper.new(url, @errors).fetch
  rescue StandardError => e
    failures += 1
    unless failures >= 5
      sleep 2
      retry
    end

    Util.warn "URL: #{url}"
    return Util.warn("#{e.message}\n#{e.backtrace[0..6].join("\n")}")
  end
  save_doc bib, url
end

#get_next_page(offset) ⇒ Object

rubocop:disable Metrics/MethodLength



144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/relaton/jis/data_fetcher.rb', line 144

# Requests the next slice of the JIS listing.
#
# The POST is attempted up to five times with a two-second pause between
# attempts. When every attempt fails the error is logged and nil is
# returned explicitly, so the caller's `while resp` loop stops instead of
# treating the logger's return value as a page.
#
# @param offset [Integer] offset of the first entry of the requested slice
# @return [Object, nil] the response of the POST, or nil when the search
#   session could not be established or all attempts failed
def get_next_page(offset) # rubocop:disable Metrics/MethodLength
  attempts = 0
  begin
    return unless initial_post

    agent.post "#{URL}W11M0070/getAddList", search_type: "JIS", offset: offset
  rescue StandardError => e
    attempts += 1
    if attempts < 5
      sleep 2
      retry
    end

    Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
    nil
  end
end

#index ⇒ Object



22
23
24
# File 'lib/relaton/jis/data_fetcher.rb', line 22

# Memoized :jis index stored in "#{INDEXFILE}.yaml".
#
# @return [Object] whatever Relaton::Index.find_or_create returned on the
#   first call
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:jis, file: "#{INDEXFILE}.yaml")
end

#index_v2 ⇒ Object

Pubid-based index built in parallel with the legacy string index. The pool keys by type, so requesting a second :jis index with a different file evicts the v1 Type from the pool, but we keep our own reference in @index, so both stay live for the duration of the crawl.



30
31
32
33
34
# File 'lib/relaton/jis/data_fetcher.rb', line 30

# Memoized :jis index stored in "#{INDEXFILE_V2}.yaml" and created with
# ::Pubid::Jis::Identifier as its pubid class.
#
# @return [Object] whatever Relaton::Index.find_or_create returned on the
#   first call
def index_v2
  return @index_v2 if @index_v2

  options = { file: "#{INDEXFILE_V2}.yaml", pubid_class: ::Pubid::Jis::Identifier }
  @index_v2 = Relaton::Index.find_or_create(:jis, **options)
end

#initial_post ⇒ Object



96
97
98
99
100
101
102
103
104
105
# File 'lib/relaton/jis/data_fetcher.rb', line 96

# Posts the JIS search form that must precede any listing request.
#
# A successful post is remembered for 600 seconds; calls inside that window
# return true without contacting the server. A post whose JSON response has
# no truthy "status" is logged and NOT remembered, so the next call tries
# again instead of reporting a stale success.
#
# @return [Object, nil] true when a recent successful post is still valid,
#   the response's "status" value after a fresh successful post, or nil
#   when the server reported no results
def initial_post
  return true if @initial_time && Time.now - @initial_time < 600

  body = { record: 0, dantai: "JIS", searchtype2: 1,
           status_1: 1, status_2: 2 } # rubocop:disable Naming/VariableNumber
  resp = agent.post "#{URL}W11M0270/index", body
  status = JSON.parse(resp.body)["status"]
  unless status
    Util.warn "No results found for JIS"
    # Explicit nil: the logger's return value must not read as success.
    return nil
  end

  @initial_time = Time.now
  status
end

#log_error(msg) ⇒ Object



18
19
20
# File 'lib/relaton/jis/data_fetcher.rb', line 18

# Forwards an error message to the Util logger.
#
# @param msg the message to log
# @return [Object] whatever Util.error returns
def log_error(msg)
  Util.error(msg)
end

#parse_offset(resp) ⇒ Object

rubocop:disable Metrics/AbcSize



123
124
125
126
127
128
129
130
131
132
# File 'lib/relaton/jis/data_fetcher.rb', line 123

# Extracts the next offset from a listing response.
#
# A response containing the element with id "btnPaging" is the first page:
# the total is read from the inline "var count = N;" script and stored in
# @count, and the offset comes from the element with id "offset". Any other
# response carries the offset inside its first script element.
#
# @param resp a parsed page responding to #at with XPath queries
# @return [Integer] the offset to request next
def parse_offset(resp) # rubocop:disable Metrics/AbcSize
  paging_button = resp.at('//*[@id="btnPaging"]')
  unless paging_button
    inline_js = resp.at("//script").text
    return inline_js.match(/\("offset"\)\.value = '(\d+)'/)[1].to_i
  end

  counter = resp.at('//script[contains(.,"var count =")]').text
  @count = counter.match(/var count = (\d+);/)[1]
  resp.at("//*[@id='offset']")[:value].to_i
end

#parse_page(resp) ⇒ Object



111
112
113
114
115
116
117
118
119
120
121
# File 'lib/relaton/jis/data_fetcher.rb', line 111

# Walks the listing starting at +resp+, queueing every document link for
# the worker threads.
#
# Paging stops when the offset reported by a page reaches #count or when no
# further page could be fetched; the worker pool is then shut down and
# awaited.
#
# @param resp the first listing page, or nil/false to skip straight to
#   shutdown
# @return [Object] the result of #end_threads_and_wait
def parse_page(resp)
  page = resp
  while page
    page.xpath('//div[@class="blockGenaral"]/a').each do |link|
      @queue.push link[:href]
    end
    next_offset = parse_offset(page)
    page = next_offset < count ? get_next_page(next_offset) : nil
  end
  end_threads_and_wait
end

#pubid(id) ⇒ Object

Parse a primary docidentifier string into a pubid identifier; nil (with a warning) if pubid can’t parse it, so a single bad id never aborts the crawl or corrupts index-v2.



39
40
41
42
43
44
# File 'lib/relaton/jis/data_fetcher.rb', line 39

# Parses a document identifier string with ::Pubid::Jis::Identifier.
#
# Any StandardError raised by the parser is logged as a warning and turned
# into nil, so one bad identifier does not stop the caller.
#
# @param id the identifier string to parse
# @return [Object, nil] the parsed identifier, or nil when parsing failed
def pubid(id)
  ::Pubid::Jis::Identifier.parse(id)
rescue StandardError => parse_error
  Util.warn("Failed to parse `#{id}` with pubid: #{parse_error.message}")
  nil
end

#save_doc(bib, url) ⇒ Object

rubocop:disable Metrics/MethodLength



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/relaton/jis/data_fetcher.rb', line 162

# Writes one scraped document to disk and registers it in both indexes.
#
# The file name is derived from the content of the primary docidentifier.
# Everything from the duplicate check to the index updates runs under
# @mutex; a file name already seen in this run is reported and skipped. The
# pubid index is only updated when the identifier could be parsed.
#
# @param bib the scraped bibliographic item, or nil/false to do nothing
# @param url the page the item came from (used in the duplicate warning)
# @return [Object, nil] nil when +bib+ is missing, otherwise the value of
#   the last step performed inside the lock
def save_doc(bib, url) # rubocop:disable Metrics/MethodLength
  return unless bib

  primary_id = bib.docidentifier.find { |docid| docid.primary }.content
  path = output_file(primary_id)
  @mutex.synchronize do
    if @files.include?(path)
      next Util.warn("File #{path} already exists. Duplication URL: #{url}")
    end

    @files << path
    File.write(path, serialize(bib), encoding: "UTF-8")
    index.add_or_update(primary_id, path)
    parsed_id = pubid(primary_id)
    index_v2.add_or_update(parsed_id, path) if parsed_id
  end
end

#to_bibxml(bib) ⇒ Object



54
55
56
# File 'lib/relaton/jis/data_fetcher.rb', line 54

# Serializes +bib+ for the "bibxml" output format by delegating to the
# item's own #to_rfcxml.
#
# @param bib the bibliographic item to serialize
# @return [Object] the result of bib.to_rfcxml
def to_bibxml(bib)
  bib.to_rfcxml()
end

#to_xml(bib) ⇒ Object



50
51
52
# File 'lib/relaton/jis/data_fetcher.rb', line 50

# Serializes +bib+ for the "xml" output format by delegating to
# Bibdata.to_xml.
#
# @param bib the bibliographic item to serialize
# @return [Object] the result of Bibdata.to_xml
def to_xml(bib)
  Bibdata.to_xml(bib)
end

#to_yaml(bib) ⇒ Object



46
47
48
# File 'lib/relaton/jis/data_fetcher.rb', line 46

# Serializes +bib+ for the "yaml" output format by delegating to
# Item.to_yaml.
#
# @param bib the bibliographic item to serialize
# @return [Object] the result of Item.to_yaml
def to_yaml(bib)
  Item.to_yaml(bib)
end