Module: Pdf2MarkdownOCR::Pdf2Image
- Defined in:
- lib/pdf2markdownOCR/pdf2image.rb
Class Method Summary
- .multi_thread_conversion(pdf_path, output_prefix) ⇒ Object
- .single_thread_conversion(pdf_path, output_prefix) ⇒ Object
Class Method Details
.multi_thread_conversion(pdf_path, output_prefix) ⇒ Object
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# File 'lib/pdf2markdownOCR/pdf2image.rb', line 21

# Converts each page of a PDF into a PNG image, running one +pdftoppm+
# invocation per page, each inside its own thread.
#
# The page count is read from +pdfinfo+ output. All user-supplied values
# (paths, DPI, page numbers) are handed to Terrapin as interpolation values
# instead of being spliced into the command string, so that paths containing
# spaces or shell metacharacters are quoted rather than interpreted by the
# shell.
#
# @param pdf_path [String] path of the PDF to convert
# @param output_prefix [String] directory that receives the generated images
# @return [Array<String>] sorted list of every path matching
#   "<output_prefix>/pdf2ocr*.png" after the conversion (this includes
#   matching files that were already present in the directory)
# @raise [RuntimeError] when no "Pages:" line is found in the pdfinfo output
def self.multi_thread_conversion(pdf_path, output_prefix)
  config = Pdf2MarkdownOCR.configuration
  logger = config.logger
  dpi = config.png_dpi_resolution
  output_base = "#{output_prefix}/pdf2ocr"

  logger.info "Converting #{pdf_path} into images. DPI: #{dpi}"

  # Get total page count
  info = Terrapin::CommandLine.new("pdfinfo", ":pdf_path").run(pdf_path: pdf_path)
  page_field = info[/^Pages:\s+(\d+)/, 1]
  # Fail with a readable message instead of a NoMethodError on nil.
  raise "Could not determine page count of #{pdf_path} from pdfinfo output" unless page_field

  total_pages = page_field.to_i
  logger.info "Total pages: #{total_pages}"

  t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  threads = (1..total_pages).map do |page|
    Thread.new do
      # -f/-l restrict this invocation to a single page.
      Terrapin::CommandLine.new(
        "pdftoppm",
        "-png -r :dpi -f :page -l :page :pdf_path :output_base"
      ).run(dpi: dpi, page: page, pdf_path: pdf_path, output_base: output_base)
      logger.info "Converted page #{page}/#{total_pages}"
    end
  end

  # join re-raises any exception that terminated a worker thread.
  threads.each(&:join)

  t2 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  logger.info "PDF to image conversion time: #{(t2 - t1).round(2)} seconds"

  Dir.glob("#{output_prefix}/pdf2ocr*.png").sort
end
.single_thread_conversion(pdf_path, output_prefix) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/pdf2markdownOCR/pdf2image.rb', line 6

# Converts a PDF into PNG images with a single +pdftoppm+ invocation.
#
# The paths and the DPI are handed to Terrapin as interpolation values
# instead of being spliced into the command string, so that paths containing
# spaces or shell metacharacters are quoted rather than interpreted by the
# shell.
#
# @param pdf_path [String] path of the PDF to convert
# @param output_prefix [String] directory that receives the generated images
# @return [Array<String>] sorted list of every path matching
#   "<output_prefix>/pdf2ocr*.png" after the conversion (this includes
#   matching files that were already present in the directory)
def self.single_thread_conversion(pdf_path, output_prefix)
  config = Pdf2MarkdownOCR.configuration
  logger = config.logger
  dpi = config.png_dpi_resolution

  logger.info "Converting #{pdf_path} into images. DPI: #{dpi}"

  t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  line = Terrapin::CommandLine.new(
    "pdftoppm",
    "-png -r :dpi :pdf_path :output_base"
  )
  line.run(dpi: dpi, pdf_path: pdf_path, output_base: "#{output_prefix}/pdf2ocr")

  t2 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  logger.info "PDF to image conversion time: #{(t2 - t1).round(2)} seconds"

  Dir.glob("#{output_prefix}/pdf2ocr*.png").sort
end