Class: Mindee::Input::Source::LocalInputSource
- Inherits:
-
Object
- Object
- Mindee::Input::Source::LocalInputSource
- Defined in:
- lib/mindee/input/sources/local_input_source.rb
Overview
Base class for loading documents.
Direct Known Subclasses
Base64InputSource, BytesInputSource, FileInputSource, PathInputSource
Instance Attribute Summary collapse
- #file_mimetype ⇒ String readonly
- #filename ⇒ String readonly
- #io_stream ⇒ StringIO | File readonly
Class Method Summary collapse
-
.fix_pdf(stream, maximum_offset: 500) ⇒ StringIO
Attempt to fix the PDF data in the given stream.
Instance Method Summary collapse
-
#apply_page_options(options) ⇒ Object
Cuts a PDF file according to provided options.
-
#compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true) ⇒ Object
Compresses the file, according to the provided info.
-
#fix_pdf!(maximum_offset: 500) ⇒ void
Attempts to fix the PDF data in the file.
-
#initialize(io_stream, filename, repair_pdf: false) ⇒ LocalInputSource
constructor
A new instance of LocalInputSource.
-
#page_count ⇒ Integer
Returns the page count for a document.
-
#pdf? ⇒ Boolean
Shorthand for PDF mimetype validation.
-
#process_pdf(options) ⇒ Object
deprecated
Deprecated.
Use #apply_page_options instead.
-
#read_contents(close: true) ⇒ Array<>
Reads a document.
-
#rescue_broken_pdf(_) ⇒ Object
deprecated
Deprecated.
See #fix_pdf! or .fix_pdf instead.
-
#source_text? ⇒ bool
Checks whether the file has source text if it is a pdf.
-
#write_to_file(path) ⇒ Object
Write the file to a given path.
Constructor Details
#initialize(io_stream, filename, repair_pdf: false) ⇒ LocalInputSource
Returns a new instance of LocalInputSource.
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 37

# Builds an input source from a stream and validates its MIME type.
#
# @param io_stream [StringIO, File] Stream holding the document.
# @param filename [String] Name of the document.
# @param repair_pdf [Boolean] When true, the MIME type is detected without the file name
#   hint, and a file named `*.pdf` whose type is not allowed goes through {#fix_pdf!}
#   before being rejected.
# @raise [Error::MindeeMimeTypeError] if the MIME type is not in ALLOWED_MIME_TYPES.
def initialize(io_stream, filename, repair_pdf: false)
  @io_stream = io_stream
  @filename = filename
  @file_mimetype = if repair_pdf
                     Marcel::MimeType.for @io_stream
                   else
                     Marcel::MimeType.for @io_stream, name: @filename
                   end

  if ALLOWED_MIME_TYPES.include? @file_mimetype
    logger.debug("Loaded new input #{@filename} from #{self.class}")
    return
  end

  if filename.end_with?('.pdf') && repair_pdf
    fix_pdf!
    # Only report a successful load once the repaired stream has actually been accepted;
    # otherwise the message would be logged right before the error below is raised.
    if ALLOWED_MIME_TYPES.include? @file_mimetype
      logger.debug("Loaded new input #{@filename} from #{self.class}")
      return
    end
  end

  raise Error::MindeeMimeTypeError, @file_mimetype.to_s
end
Instance Attribute Details
#file_mimetype ⇒ String (readonly)
30 31 32 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 30

# @return [String] the value stored in @file_mimetype.
def file_mimetype
  @file_mimetype
end
#filename ⇒ String (readonly)
28 29 30 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 28

# @return [String] the value stored in @filename.
def filename
  @filename
end
#io_stream ⇒ StringIO | File (readonly)
32 33 34 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 32

# @return [StringIO, File] the value stored in @io_stream.
def io_stream
  @io_stream
end
Class Method Details
.fix_pdf(stream, maximum_offset: 500) ⇒ StringIO
Attempt to fix the PDF data in the given stream.
85 86 87 88 89 90 91 92 93 94 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 85

# Attempts to fix PDF data by dropping everything that precedes the `%PDF-` header.
#
# @param stream [StringIO, File] Seekable stream to read from.
# @param maximum_offset [Integer] Highest stream position allowed right after the
#   `%PDF-` marker has been read.
# @return [StringIO] New binary stream starting at the `%PDF-` marker, rewound to 0.
# @raise [Error::MindeePDFError] if the search hits end-of-stream or the marker ends
#   beyond maximum_offset.
def self.fix_pdf(stream, maximum_offset: 500)
  # Always search from the beginning: a stream left mid-way by a previous reader would
  # otherwise be rejected even though its header is intact.
  stream.rewind
  stream.gets('%PDF-')
  raise Error::MindeePDFError if stream.eof? || stream.pos > maximum_offset

  # Step back over the 5-byte marker so the copy starts with "%PDF-".
  stream.pos = stream.pos - 5
  out_stream = StringIO.new(''.b)
  out_stream.write(stream.read.to_s.b)
  out_stream.rewind
  out_stream
end
Instance Method Details
#apply_page_options(options) ⇒ Object
Cuts a PDF file according to provided options.
104 105 106 107 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 104

# Cuts a PDF file according to provided options.
#
# Rewinds the current stream, hands it to PDF::PDFProcessor.parse together with the
# options, and stores the result as the new stream.
#
# @param options [Object] Page options forwarded unchanged to PDF::PDFProcessor.parse.
# @return [Object] the value returned by PDF::PDFProcessor.parse (also the new @io_stream).
def apply_page_options(options)
  @io_stream.seek(0)
  @io_stream = PDF::PDFProcessor.parse(@io_stream, options)
end
#compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true) ⇒ Object
Compresses the file, according to the provided info.
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 167

# Compresses the file in place, replacing @io_stream with the compressed buffer.
#
# PDFs (see #pdf?) go through Mindee::PDF::PDFCompressor.compress_pdf; everything else
# goes through Mindee::Image::ImageCompressor.compress_image.
#
# @param quality [Integer] Forwarded to both compressors.
# @param max_width [Integer, nil] Forwarded to the image compressor only.
# @param max_height [Integer, nil] Forwarded to the image compressor only.
# @param force_source_text [Boolean] Forwarded to the PDF compressor as
#   `force_source_text_compression`.
# @param disable_source_text [Boolean] Forwarded to the PDF compressor only.
# @raise [NotImplementedError] if Mindee::Dependencies.all_deps_available? is false.
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
  unless Mindee::Dependencies.all_deps_available?
    raise NotImplementedError, Mindee::Dependencies::MINDEE_DEPENDENCIES_LOAD_ERROR
  end

  buffer = if pdf?
             Mindee::PDF::PDFCompressor.compress_pdf(
               @io_stream,
               quality: quality,
               force_source_text_compression: force_source_text,
               disable_source_text: disable_source_text
             )
           else
             Mindee::Image::ImageCompressor.compress_image(
               @io_stream,
               quality: quality,
               max_width: max_width,
               max_height: max_height
             )
           end
  @io_stream = buffer
  @io_stream.rewind
end
#fix_pdf!(maximum_offset: 500) ⇒ void
This method returns an undefined value.
Attempts to fix the PDF data in the file.
74 75 76 77 78 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 74

# Attempts to fix the PDF data in the file.
#
# Replaces @io_stream with the output of LocalInputSource.fix_pdf, rewinds it, and
# re-detects @file_mimetype from the new stream.
#
# @param maximum_offset [Integer] Forwarded to LocalInputSource.fix_pdf.
# @return [void]
def fix_pdf!(maximum_offset: 500)
  @io_stream = LocalInputSource.fix_pdf(@io_stream, maximum_offset: maximum_offset)
  @io_stream.rewind
  @file_mimetype = Marcel::MimeType.for @io_stream
end
#page_count ⇒ Integer
Returns the page count for a document. Defaults to one for images.
147 148 149 150 151 152 153 154 155 156 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 147

# Returns the page count for a document. Defaults to one for anything that is not a PDF.
#
# @return [Integer] 1 unless #pdf? is true; otherwise the number of pages reported by
#   the object returned from Mindee::PDF::PDFProcessor.open_pdf.
# @raise [NotImplementedError] if Mindee::Dependencies.all_deps_available? is false.
def page_count
  unless Mindee::Dependencies.all_deps_available?
    raise NotImplementedError, Mindee::Dependencies::MINDEE_DEPENDENCIES_LOAD_ERROR
  end

  return 1 unless pdf?

  @io_stream.seek(0)
  pdf_processor = Mindee::PDF::PDFProcessor.open_pdf(@io_stream)
  pdf_processor.pages.size
end
#pdf? ⇒ Boolean
Shorthand for PDF mimetype validation.
66 67 68 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 66

# Shorthand for PDF mimetype validation.
#
# @return [Boolean] true when @file_mimetype, converted to a string, is exactly
#   'application/pdf'.
def pdf?
  @file_mimetype.to_s == 'application/pdf'
end
#process_pdf(options) ⇒ Object
Use #apply_page_options instead.
111 112 113 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 111

# @deprecated Use {#apply_page_options} instead.
#
# @param options [Object] Forwarded unchanged to #apply_page_options.
# @return [Object] whatever #apply_page_options returns.
def process_pdf(options)
  apply_page_options(options)
end
#read_contents(close: true) ⇒ Array<>
Reads a document.
118 119 120 121 122 123 124 125 126 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 118

# Reads a document.
#
# @param close [Boolean] Whether to close the stream once it has been read.
# @return [Array] Two elements: the data read from the stream, and a Hash whose
#   `:filename` key holds the file name passed through
#   Mindee::Input::Source.convert_to_unicode_escape.
def read_contents(close: true)
  logger.debug("Reading data from: #{@filename}")
  @io_stream.seek(0)
  # Avoids needlessly re-packing some files
  data = @io_stream.read
  @io_stream.rewind
  @io_stream.close if close
  [data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }]
end
#rescue_broken_pdf(_) ⇒ Object
See #fix_pdf! or Mindee::Input::Source::LocalInputSource.fix_pdf instead.
61 62 63 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 61

# @deprecated See {#fix_pdf!} or {.fix_pdf} instead.
#
# The single argument is ignored; the call is forwarded to #fix_pdf! with its defaults.
def rescue_broken_pdf(_)
  fix_pdf!
end
#source_text? ⇒ bool
Checks whether the file has source text if it is a PDF. Returns false otherwise.
193 194 195 196 197 198 199 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 193

# Checks whether the file has source text.
#
# The decision is delegated entirely to Mindee::PDF::PDFTools.source_text?, which
# receives the current stream.
# NOTE(review): non-PDF handling presumably happens inside that helper — confirm.
#
# @return [Boolean] the value returned by Mindee::PDF::PDFTools.source_text?.
# @raise [NotImplementedError] if Mindee::Dependencies.all_deps_available? is false.
def source_text?
  unless Mindee::Dependencies.all_deps_available?
    raise NotImplementedError, Mindee::Dependencies::MINDEE_DEPENDENCIES_LOAD_ERROR
  end

  Mindee::PDF::PDFTools.source_text?(@io_stream)
end
#write_to_file(path) ⇒ Object
Write the file to a given path. Uses the initial file name by default.
130 131 132 133 134 135 136 137 138 139 140 141 142 |
# File 'lib/mindee/input/sources/local_input_source.rb', line 130

# Write the file to a given path.
#
# When the path is an existing directory or ends with '/', the stored file name is
# appended to it; otherwise the path is used as given. Missing parent directories are
# created, and the stream is rewound both before and after the write.
#
# @param path [String, nil] Destination file or directory.
def write_to_file(path)
  t_path = if File.directory?(path || '') || path.to_s.end_with?('/')
             File.join(path || '', @filename)
           else
             path
           end
  full_path = File.expand_path(t_path || '')
  FileUtils.mkdir_p(File.dirname(full_path))
  @io_stream.rewind
  File.binwrite(full_path, @io_stream.read || '')
  logger.debug("Wrote file successfully to #{full_path}")
  @io_stream.rewind
end