Module: Pdf2MarkdownOCR

Defined in:
lib/pdf2markdownOCR.rb,
lib/pdf2markdownOCR/cli.rb,
lib/pdf2markdownOCR/llm_api.rb,
lib/pdf2markdownOCR/version.rb,
lib/pdf2markdownOCR/pdf2image.rb,
lib/pdf2markdownOCR/configuration.rb

Defined Under Namespace

Modules: LlmApi, Pdf2Image
Classes: CLI, Configuration, FileNotFoundError

Constant Summary collapse

# Version string of the gem.
VERSION = "0.0.2"

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.configuration ⇒ Object



20
21
22
# File 'lib/pdf2markdownOCR.rb', line 20

# Returns the memoized Configuration, creating it on first access.
#
# @return [Configuration] the same instance on every subsequent call
def configuration
  return @configuration if @configuration

  @configuration = Configuration.new
end

Class Method Details

.configure {|configuration| ... } ⇒ Object

Yields:



24
25
26
# File 'lib/pdf2markdownOCR.rb', line 24

# Yields the current configuration object to the given block so callers can
# adjust settings in place.
#
# @yieldparam configuration the object returned by +configuration+
# @return [Object] whatever the block returns
def configure
  settings = configuration
  yield settings
end

.convert_pdf(pdf_path, output_file = nil) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/pdf2markdownOCR.rb', line 30

# Converts a PDF to Markdown.
#
# The PDF is handed to Pdf2Image (multi- or single-threaded, depending on the
# configured mode) using a temporary working directory, and the conversion
# result is passed to LlmApi.ocr_images. The temporary directory is removed
# afterwards, even when a step raises.
#
# @param pdf_path [String] path of the PDF to convert
# @param output_file [String, Pathname, nil] destination for the Markdown;
#   when nil or an empty path the Markdown is returned instead of written
# @return [String, nil] the Markdown content, or nil when it was written to
#   +output_file+
# @raise [FileNotFoundError] if +pdf_path+ does not exist
def self.convert_pdf(pdf_path, output_file = nil)
  config = Pdf2MarkdownOCR.configuration
  logger = config.logger

  logger.info "Parsing PDF file: #{pdf_path}"
  unless File.exist?(pdf_path)
    logger.error "File not found: #{pdf_path}"
    raise FileNotFoundError, "File not found: #{pdf_path}"
  end

  tempdir = nil
  begin
    tempdir = Dir.mktmpdir

    images =
      if config.mode == :multi_thread
        Pdf2MarkdownOCR::Pdf2Image.multi_thread_conversion(pdf_path, tempdir)
      else
        Pdf2MarkdownOCR::Pdf2Image.single_thread_conversion(pdf_path, tempdir)
      end

    markdown_content = Pdf2MarkdownOCR::LlmApi.ocr_images(images)
  ensure
    # Clean up the temporary directory whether or not processing succeeded.
    FileUtils.remove_entry(tempdir) if tempdir && Dir.exist?(tempdir)
  end

  # Check the path *string* for emptiness: Pathname#empty? inspects the file
  # on disk, so an existing zero-byte target would otherwise be mistaken for
  # "no output file given".
  write_to_file = output_file && !output_file.to_s.empty?
  return markdown_content unless write_to_file

  File.write(output_file, markdown_content)
  nil
end

.gem_version ⇒ Object



6
7
8
# File 'lib/pdf2markdownOCR/version.rb', line 6

# Wraps the VERSION constant in a Gem::Version.
#
# @return [Gem::Version] version object built from VERSION
def self.gem_version
  version_string = VERSION
  Gem::Version.new(version_string)
end