Class: Mindee::Input::Source::LocalInputSource
- Inherits:
-
Object
- Object
- Mindee::Input::Source::LocalInputSource
- Defined in:
- lib/mindee/input/sources/local_input_source.rb
Overview
Base class for loading documents.
Direct Known Subclasses
Base64InputSource, BytesInputSource, FileInputSource, PathInputSource
Instance Attribute Summary collapse
- #file_mimetype ⇒ String readonly
- #filename ⇒ String readonly
- #io_stream ⇒ StringIO | File readonly
Class Method Summary collapse
-
.fix_pdf(stream, maximum_offset: 500) ⇒ StringIO
Attempt to fix the PDF data in the given stream.
Instance Method Summary collapse
-
#apply_page_options(options) ⇒ Object
Cuts a PDF file according to provided options.
-
#compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true) ⇒ Object
Compresses the file, according to the provided info.
-
#fix_pdf!(maximum_offset: 500) ⇒ void
Attempts to fix the PDF data in the file.
-
#initialize(io_stream, filename, repair_pdf: false) ⇒ LocalInputSource
constructor
A new instance of LocalInputSource.
-
#page_count ⇒ Integer
Returns the page count for a document.
-
#pdf? ⇒ Boolean
Shorthand for PDF mimetype validation.
-
#process_pdf(options) ⇒ Object
deprecated
Deprecated.
Use #apply_page_options instead.
-
#read_contents(close: true) ⇒ Array<>
Reads a document.
-
#rescue_broken_pdf(_) ⇒ Object
deprecated
Deprecated.
See #fix_pdf! or .fix_pdf instead.
-
#source_text? ⇒ bool
Checks whether the file has source text if it is a pdf.
-
#write_to_file(path) ⇒ Object
Write the file to a given path.
Constructor Details
#initialize(io_stream, filename, repair_pdf: false) ⇒ LocalInputSource
Returns a new instance of LocalInputSource.
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 38 def initialize(io_stream, filename, repair_pdf: false) @io_stream = io_stream @filename = filename @file_mimetype = detect_mime_type(repair_pdf) if ALLOWED_MIME_TYPES.include? @file_mimetype logger.debug("Loaded new input #{@filename} from #{self.class}") return end if filename.end_with?('.pdf') && repair_pdf fix_pdf! logger.debug("Loaded new input #{@filename} from #{self.class}") return if ALLOWED_MIME_TYPES.include? @file_mimetype end raise Error::MindeeMimeTypeError, @file_mimetype.to_s end |
Instance Attribute Details
#file_mimetype ⇒ String (readonly)
31 32 33 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 31 def file_mimetype @file_mimetype end |
#filename ⇒ String (readonly)
29 30 31 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 29 def filename @filename end |
#io_stream ⇒ StringIO | File (readonly)
33 34 35 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 33 def io_stream @io_stream end |
Class Method Details
.fix_pdf(stream, maximum_offset: 500) ⇒ StringIO
Attempt to fix the PDF data in the given stream.
82 83 84 85 86 87 88 89 90 91 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 82 def self.fix_pdf(stream, maximum_offset: 500) out_stream = StringIO.new(''.b) stream.gets('%PDF-') raise Error::MindeePDFError if stream.eof? || stream.pos > maximum_offset stream.pos = stream.pos - 5 out_stream.write(stream.read.to_s.b) out_stream.rewind out_stream end |
Instance Method Details
#apply_page_options(options) ⇒ Object
Cuts a PDF file according to provided options.
101 102 103 104 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 101 def apply_page_options(options) @io_stream.seek(0) @io_stream = PDF::PDFProcessor.parse(@io_stream, options) end |
#compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true) ⇒ Object
Compresses the file, according to the provided info.
164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 164 def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true) unless Mindee::Dependencies.all_deps_available? raise NotImplementedError, Mindee::Dependencies::MINDEE_DEPENDENCIES_LOAD_ERROR end buffer = if pdf? Mindee::PDF::PDFCompressor.compress_pdf( @io_stream, quality: quality, force_source_text_compression: force_source_text, disable_source_text: disable_source_text ) else Mindee::Image::ImageCompressor.compress_image( @io_stream, quality: quality, max_width: max_width, max_height: max_height ) end @io_stream = buffer @io_stream.rewind end |
#fix_pdf!(maximum_offset: 500) ⇒ void
This method returns an undefined value.
Attempts to fix the PDF data in the file.
71 72 73 74 75 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 71 def fix_pdf!(maximum_offset: 500) @io_stream = LocalInputSource.fix_pdf(@io_stream, maximum_offset: maximum_offset) @io_stream.rewind @file_mimetype = Marcel::MimeType.for @io_stream end |
#page_count ⇒ Integer
Returns the page count for a document. Defaults to one for images.
144 145 146 147 148 149 150 151 152 153 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 144 def page_count unless Mindee::Dependencies.all_deps_available? raise NotImplementedError, Mindee::Dependencies::MINDEE_DEPENDENCIES_LOAD_ERROR end return 1 unless pdf? @io_stream.seek(0) pdf_processor = Mindee::PDF::PDFProcessor.open_pdf(@io_stream) pdf_processor.pages.size end |
#pdf? ⇒ Boolean
Shorthand for PDF mimetype validation.
63 64 65 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 63 def pdf? @file_mimetype.to_s == 'application/pdf' end |
#process_pdf(options) ⇒ Object
Use #apply_page_options instead.
108 109 110 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 108 def process_pdf(options) apply_page_options(options) end |
#read_contents(close: true) ⇒ Array<>
Reads a document.
115 116 117 118 119 120 121 122 123 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 115 def read_contents(close: true) logger.debug("Reading data from: #{@filename}") @io_stream.seek(0) # Avoids needlessly re-packing some files data = @io_stream.read @io_stream.rewind @io_stream.close if close [data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }] end |
#rescue_broken_pdf(_) ⇒ Object
See #fix_pdf! or Mindee::Input::Source::LocalInputSource.fix_pdf instead.
58 59 60 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 58 def rescue_broken_pdf(_) fix_pdf! end |
#source_text? ⇒ bool
Checks whether the file has source text if it is a PDF; returns false otherwise.
190 191 192 193 194 195 196 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 190 def source_text? unless Mindee::Dependencies.all_deps_available? raise NotImplementedError, Mindee::Dependencies::MINDEE_DEPENDENCIES_LOAD_ERROR end Mindee::PDF::PDFTools.source_text?(@io_stream) end |
#write_to_file(path) ⇒ Object
Write the file to a given path. Uses the initial file name by default.
127 128 129 130 131 132 133 134 135 136 137 138 139 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 127 def write_to_file(path) t_path = if File.directory?(path || '') || path.to_s.end_with?('/') File.join(path || '', @filename) else path end full_path = File.expand_path(t_path || '') FileUtils.mkdir_p(File.dirname(full_path)) @io_stream.rewind File.binwrite(full_path, @io_stream.read || '') logger.debug("Wrote file successfully to #{full_path}") @io_stream.rewind end |