9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
# File 'lib/pdf2markdownOCR/cli.rb', line 9
# Command-line entry point: parses the options in +argv+, applies them to the
# Pdf2MarkdownOCR configuration, converts the given PDF and, when no output
# file was requested, prints the resulting Markdown to STDOUT.
#
# Terminates the process (via +abort+) with a usage message when the command
# line cannot be parsed or when no PDF path is supplied. The +--version+ and
# +--help+ switches print their text and +exit+.
#
# @param argv [Array<String>] command-line arguments; consumed in place by
#   the option parser (defaults to the process-wide ARGV)
# @return [void]
def self.run(argv = ARGV)
  options = {}

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: pdf2markdownocr [options] <pdf_path>"
    opts.separator ""
    opts.separator "Converts a PDF file to Markdown using OCR. If no output file is specified, the Markdown content will be printed to STDOUT."
    opts.separator ""
    opts.separator "Options:"

    opts.on("-o", "--output FILE", "Output Markdown file") do |file|
      options[:output] = file
    end

    opts.on("--llm-api-url URL", "OpenAI compatible server URL (default: http://localhost:8000)") do |url|
      options[:llm_api_url] = url
    end

    opts.on("--llm-model MODEL", "LLM model to use (default: deepseek-ai/DeepSeek-OCR-2)") do |model|
      options[:llm_model] = model
    end

    opts.on("--mode MODE", "Processing mode: single_thread or multi_thread (default: multi_thread)") do |mode|
      options[:mode] = mode
    end

    # Integer coercion: a non-numeric value raises OptionParser::InvalidArgument.
    opts.on("--png-dpi DPI", Integer, "DPI resolution for PNG conversion (default: 300)") do |dpi|
      options[:png_dpi] = dpi
    end

    opts.on("--pages PAGES", "Pages to process, separated by commas. Ranges can be provided with a dash (e.g., 1,3-5)") do |pages|
      options[:pages] = pages
    end

    opts.on("-v", "--version", "Print version") do
      puts Pdf2MarkdownOCR.gem_version
      exit
    end

    opts.on("-h", "--help", "Show this help message") do
      puts opts
      exit
    end
  end

  begin
    parser.parse!(argv)
  rescue OptionParser::ParseError => e
    # ParseError is the common superclass of InvalidOption, MissingArgument,
    # InvalidArgument, etc.; rescuing it gives every malformed command line
    # the same friendly error + usage output instead of a raw backtrace.
    abort "Error: #{e.message}\n\n#{parser}"
  end

  # After parse!, the first remaining positional argument is the PDF path.
  pdf_path = argv.shift
  abort "Error: no PDF file specified.\n\n#{parser}" if pdf_path.nil? || pdf_path.empty?

  # Only override settings the user explicitly passed on the command line.
  Pdf2MarkdownOCR.configure do |config|
    config.llm_api_url = options[:llm_api_url] if options[:llm_api_url]
    config.llm_model = options[:llm_model] if options[:llm_model]
    config.mode = options[:mode] if options[:mode]
    config.png_dpi_resolution = options[:png_dpi] if options[:png_dpi]
  end

  markdown_content = Pdf2MarkdownOCR.convert_pdf(pdf_path: pdf_path,
                                                 output_file: options[:output],
                                                 pages: options[:pages])

  # Echo to STDOUT only when no output file was requested.
  puts markdown_content if markdown_content && !markdown_content.empty? && !options[:output]
end
|