Class: Markdownator::Converters::Pdf

Inherits:
Base
  • Object
show all
Defined in:
lib/markdownator/converters/pdf.rb

Overview

Extracts text from a PDF (one block per page) using the `pdf-reader` gem.

Instance Method Summary collapse

Instance Method Details

#accepts?(io, stream_info) ⇒ Boolean

Returns:

  • (Boolean)


7
8
9
10
11
# File 'lib/markdownator/converters/pdf.rb', line 7

# Decides whether this converter should handle the given stream.
#
# The stream metadata is checked first (a `pdf` extension or the
# `application/pdf` MIME type); only when that does not match is the
# stream itself handed to `magic_pdf?` for a content-based check.
#
# @param io [Object] the input stream, passed to `magic_pdf?` as a fallback
# @param stream_info [Object] stream metadata, passed to `matches?`
# @return [Boolean] `true` on a metadata match, otherwise the result of
#   `magic_pdf?(io)`
def accepts?(io, stream_info)
  declared_pdf = matches?(stream_info, extensions: %w[pdf], mimetypes: %w[application/pdf])

  # NOTE(review): magic_pdf? is defined elsewhere; presumably it sniffs the
  # stream's leading bytes — confirm against the base class.
  declared_pdf ? true : magic_pdf?(io)
end

#convert(io, _stream_info, **_options) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/markdownator/converters/pdf.rb', line 13

# Converts a PDF stream to Markdown, emitting one text block per page.
#
# Each page's text is stripped of surrounding whitespace, pages that end up
# empty are dropped, and the remaining pages are joined with a `---`
# separator. The reader's page count is reported in the result metadata.
#
# @param io [Object] the PDF input handed to `PDF::Reader.new`
# @param _stream_info [Object] unused
# @param _options [Hash] unused
# @return [Result] result carrying `markdown` and `metadata[:page_count]`
# @raise [FileConversionError] when pdf-reader reports a malformed PDF or an
#   unsupported feature
def convert(io, _stream_info, **_options)
  # Load the optional gem OUTSIDE the rescue scope. If loading fails, a
  # surrounding `rescue PDF::Reader::...` would have to resolve the `PDF`
  # constant while matching the exception, raising NameError and masking the
  # real missing-dependency error.
  Markdownator.require_optional("pdf-reader", feature: "PDF conversion")

  begin
    reader = PDF::Reader.new(io)
    pages = reader.pages.map { |page| page.text.strip }.reject(&:empty?)
    Result.new(
      markdown: pages.join("\n\n---\n\n"),
      metadata: { page_count: reader.page_count }
    )
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
    raise FileConversionError, "Could not read PDF: #{e.message}"
  end
end