Module: Pdf2MarkdownOCR::Pdf2Image

Defined in:
lib/pdf2markdownOCR/pdf2image.rb

Class Method Summary

Class Method Details

.multi_thread_conversion(pdf_path:, output_prefix:, pages: nil) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/pdf2markdownOCR/pdf2image.rb', line 46

# Renders PDF pages to PNG files by running one `pdftoppm` process per page,
# spread over a bounded pool of worker threads.
#
# Every value passed to the external tools goes through Terrapin's `:name`
# interpolation, which shell-quotes it, so paths containing spaces or shell
# metacharacters reach `pdfinfo`/`pdftoppm` as a single literal argument.
#
# @param pdf_path [String] path of the PDF to convert
# @param output_prefix [String] directory that receives the `pdf2ocr*.png` files
# @param pages [String, nil] comma-separated pages and/or ranges (e.g. "1-3,7");
#   when nil, the page count is read from `pdfinfo` and every page is converted
# @param max_threads [Integer] upper bound on concurrently running `pdftoppm`
#   processes; values below 1 are treated as 1
# @return [Array<String>] sorted paths matching "<output_prefix>/pdf2ocr*.png"
# @raise [RuntimeError] if `pages` is nil and the `pdfinfo` output has no "Pages:" line
def self.multi_thread_conversion(pdf_path:, output_prefix:, pages: nil, max_threads: 8)
  config = Pdf2MarkdownOCR.configuration
  logger = config.logger
  dpi = config.png_dpi_resolution
  output_root = "#{output_prefix}/pdf2ocr"

  logger.info "Converting #{pdf_path} into images. DPI: #{dpi}"

  pages_to_process =
    if pages.nil?
      info = Terrapin::CommandLine.new("pdfinfo", ":pdf_path").run(pdf_path: pdf_path)
      page_count = info[/^Pages:\s+(\d+)/, 1]
      # Fail with a readable message instead of a NoMethodError on nil.
      raise "Could not determine page count for #{pdf_path} from pdfinfo output" unless page_count

      (1..page_count.to_i).to_a
    else
      pages.split(",").flat_map do |page_range|
        if page_range.include?("-")
          start_page, end_page = page_range.split("-").map(&:to_i)
          (start_page..end_page).to_a
        else
          [page_range.to_i]
        end
      end
    end

  total = pages_to_process.size
  logger.info "Total pages to process: #{total}"
  t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  # A closed queue makes `pop` return nil once drained, which ends each worker loop.
  queue = Queue.new
  pages_to_process.each { |page| queue << page }
  queue.close

  # Never start more workers than pages, and always at least one when there is work.
  worker_count = [[max_threads.to_i, 1].max, total].min
  threads = Array.new(worker_count) do
    Thread.new do
      while (page = queue.pop)
        Terrapin::CommandLine.new(
          "pdftoppm",
          "-png -r :dpi -f :page -l :page :pdf_path :output_root"
        ).run(dpi: dpi, page: page, pdf_path: pdf_path, output_root: output_root)

        logger.info "Converted page #{page}/#{total}"
      end
    end
  end

  # Thread#join re-raises the first worker failure in the calling thread.
  threads.each(&:join)
  t2 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  logger.info "PDF to image conversion time: #{(t2 - t1).round(2)} seconds"

  Dir.glob("#{output_root}*.png").sort
end

.single_thread_conversion(pdf_path:, output_prefix:, pages: nil) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/pdf2markdownOCR/pdf2image.rb', line 7

# Renders PDF pages to PNG files with sequential `pdftoppm` invocations:
# one call for the whole document, or one call per requested page/range.
#
# Every value passed to `pdftoppm` goes through Terrapin's `:name`
# interpolation, which shell-quotes it, so paths containing spaces or shell
# metacharacters reach the tool as a single literal argument.
#
# @param pdf_path [String] path of the PDF to convert
# @param output_prefix [String] directory that receives the `pdf2ocr*.png` files
# @param pages [String, nil] comma-separated pages and/or ranges (e.g. "1-3,7");
#   when nil, the whole document is converted in a single call
# @return [Array<String>] sorted paths matching "<output_prefix>/pdf2ocr*.png"
def self.single_thread_conversion(pdf_path:, output_prefix:, pages: nil)
  config = Pdf2MarkdownOCR.configuration
  logger = config.logger
  dpi = config.png_dpi_resolution
  output_root = "#{output_prefix}/pdf2ocr"

  logger.info "Converting #{pdf_path} into images. DPI: #{dpi}"

  t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  if pages.nil?
    Terrapin::CommandLine.new(
      "pdftoppm",
      "-png -r :dpi :pdf_path :output_root"
    ).run(dpi: dpi, pdf_path: pdf_path, output_root: output_root)
  else
    pages.split(",").each do |page_range|
      first_page, last_page =
        if page_range.include?("-")
          page_range.split("-").map(&:to_i)
        else
          # Strip padding such as the space in "1, 3": once quoted, the shell
          # no longer trims it. Non-numeric input is still left for pdftoppm to reject.
          [page_range.strip] * 2
        end

      Terrapin::CommandLine.new(
        "pdftoppm",
        "-png -r :dpi -f :first_page -l :last_page :pdf_path :output_root"
      ).run(
        dpi: dpi,
        first_page: first_page,
        last_page: last_page,
        pdf_path: pdf_path,
        output_root: output_root
      )
    end
  end

  t2 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  logger.info "PDF to image conversion time: #{(t2 - t1).round(2)} seconds"
  Dir.glob("#{output_root}*.png").sort
end