Module: Pdf2MarkdownOCR::Pdf2Image

Defined in:
lib/pdf2markdownOCR/pdf2image.rb

Class Method Summary collapse

Class Method Details

.multi_thread_conversion(pdf_path, output_prefix) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/pdf2markdownOCR/pdf2image.rb', line 21

# Renders every page of a PDF to a PNG image, running one +pdftoppm+ process
# per page, each from its own thread.
#
# The page count is read from the "Pages:" line of the +pdfinfo+ output.
# Paths are handed to Terrapin as interpolations so they are shell-quoted;
# a path containing spaces or shell metacharacters is passed through intact.
#
# @param pdf_path [String] path of the PDF to convert
# @param output_prefix [String] directory that receives the pdf2ocr*.png files
# @return [Array<String>] sorted paths matching "#{output_prefix}/pdf2ocr*.png"
# @raise [RuntimeError] when the pdfinfo output contains no "Pages:" line
def self.multi_thread_conversion(pdf_path, output_prefix)
  config = Pdf2MarkdownOCR.configuration
  logger = config.logger
  dpi = config.png_dpi_resolution

  logger.info "Converting #{pdf_path} into images. DPI: #{dpi}"

  # Get total page count
  info = Terrapin::CommandLine.new("pdfinfo", ":pdf").run(pdf: pdf_path.to_s)
  pages_match = info.match(/^Pages:\s+(\d+)/)
  raise "Could not read the page count of #{pdf_path} from pdfinfo output" unless pages_match

  total_pages = pages_match[1].to_i
  logger.info "Total pages: #{total_pages}"

  image_root = "#{output_prefix}/pdf2ocr"
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  # NOTE: one thread (and one pdftoppm process) is started per page, with no upper bound.
  threads = (1..total_pages).map do |page|
    Thread.new do
      # -f/-l restrict this pdftoppm run to the single page handled by this thread.
      Terrapin::CommandLine.new(
        "pdftoppm",
        "-png -r :dpi -f :page -l :page :pdf :out"
      ).run(dpi: dpi.to_s, page: page.to_s, pdf: pdf_path.to_s, out: image_root)

      logger.info "Converted page #{page}/#{total_pages}"
    end
  end

  # join re-raises any exception raised inside a worker thread.
  threads.each(&:join)
  finished_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  logger.info "PDF to image conversion time: #{(finished_at - started_at).round(2)} seconds"

  Dir.glob("#{output_prefix}/pdf2ocr*.png").sort
end

.single_thread_conversion(pdf_path, output_prefix) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/pdf2markdownOCR/pdf2image.rb', line 6

# Renders every page of a PDF to PNG images with a single +pdftoppm+ run.
#
# Paths are handed to Terrapin as interpolations so they are shell-quoted;
# a path containing spaces or shell metacharacters is passed through intact.
#
# @param pdf_path [String] path of the PDF to convert
# @param output_prefix [String] directory that receives the pdf2ocr*.png files
# @return [Array<String>] sorted paths matching "#{output_prefix}/pdf2ocr*.png"
def self.single_thread_conversion(pdf_path, output_prefix)
  config = Pdf2MarkdownOCR.configuration
  logger = config.logger
  dpi = config.png_dpi_resolution

  logger.info "Converting #{pdf_path} into images. DPI: #{dpi}"

  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  Terrapin::CommandLine.new(
    "pdftoppm",
    "-png -r :dpi :pdf :out"
  ).run(dpi: dpi.to_s, pdf: pdf_path.to_s, out: "#{output_prefix}/pdf2ocr")

  finished_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  logger.info "PDF to image conversion time: #{(finished_at - started_at).round(2)} seconds"
  Dir.glob("#{output_prefix}/pdf2ocr*.png").sort
end