32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
# File 'lib/pdf2markdownOCR/llm_api.rb', line 32
# Runs OCR over a batch of page images and returns the combined Markdown.
#
# One POST request per image is queued on a single Typhoeus::Hydra and all
# of them are executed by one +hydra.run+ call. Each response is expected to
# carry the page text at +choices[0].message.content+. Pages are re-ordered
# by their position in +images+ before being joined, because completion
# callbacks may fire in any order.
#
# A page that fails (HTTP/transport error, unparsable body, or empty content)
# is logged and skipped; it never aborts the remaining pages.
#
# @param images [Array] image references, each passed as-is to
#   Pdf2MarkdownOCR::LlmApi.payload (presumably file paths)
# @return [String] page contents joined by "\n\n---\n\n"; "" when no page
#   produced any content
def self.ocr_images(images)
  logger = Pdf2MarkdownOCR.configuration.logger
  logger.info "OCR #{images.size} images"

  endpoint = "#{Pdf2MarkdownOCR.configuration.llm_api_url}/v1/chat/completions"
  markdown_pages = []
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  hydra = Typhoeus::Hydra.new

  images.each_with_index do |image_path, index|
    request = Typhoeus::Request.new(
      endpoint,
      method: :post,
      body: Pdf2MarkdownOCR::LlmApi.payload(image_path).to_json,
      headers: { "Content-Type" => "application/json" },
      timeout: 600
    )

    request.on_complete do |response|
      unless response.success?
        logger.error "Error processing #{image_path}: #{response.return_message} (#{response.code})"
        next
      end

      # A 2xx response is not guaranteed to be the JSON we expect (e.g. an
      # HTML page from a proxy). An exception escaping this callback would
      # propagate out of hydra.run and lose every other page, so contain it.
      begin
        parsed = JSON.parse(response.body)
        content = parsed.is_a?(Hash) ? parsed.dig("choices", 0, "message", "content") : nil
      rescue JSON::ParserError, TypeError => e
        logger.error "Error parsing response for #{image_path}: #{e.message}"
        next
      end

      if content && !content.empty?
        markdown_pages << { index: index, content: content }
      else
        logger.warn "Warning: No Markdown content generated for #{image_path}"
      end
    end

    hydra.queue(request)
  end

  hydra.run
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  logger.info "Total Image processing time: #{elapsed.round(2)} seconds"

  markdown_pages.sort_by { |page| page[:index] }.map { |page| page[:content] }.join("\n\n---\n\n")
end
|