Class: ChupaText::Data
- Inherits:
-
Object
- Object
- ChupaText::Data
- Defined in:
- lib/chupa-text/data.rb
Direct Known Subclasses
Instance Attribute Summary collapse
-
#attributes ⇒ Attributes
readonly
The attributes of the data.
-
#body ⇒ String?
The content of the data,
nil
if the data doesn’t have any content. -
#expected_screenshot_size ⇒ Array<Integer, Integer>
The expected screenshot size.
-
#limit_as ⇒ Numeric, ...
The max memory on extraction by external command.
-
#limit_cpu ⇒ Numeric, ...
The max CPU time on extraction by external command.
-
#max_body_size ⇒ Integer?
The max body size in bytes.
-
#need_screenshot ⇒ Bool
writeonly
The specified value.
-
#path ⇒ String?
The path associated with the content of the data,
nil
if the data isn’t associated with any file. -
#screenshot ⇒ Screenshot?
The screenshot of the data.
-
#size ⇒ Integer?
The byte size of the data,
nil
if the data doesn’t have any content. -
#source ⇒ Data?
The source of the data.
-
#timeout ⇒ Numeric, ...
The timeout on extraction.
-
#uri ⇒ URI?
The URI of the data if the data is for remote or local file,
nil
if the data isn’t associated with any URIs.
Instance Method Summary collapse
-
#[](name) ⇒ Object
-
#[]=(name, value) ⇒ Object
-
#extension ⇒ String?
Normalized extension as String if #uri is not
nil
,nil
otherwise. -
#initialize(options = {}) ⇒ Data
constructor
A new instance of Data.
-
#initialize_copy(object) ⇒ Object
-
#merge!(data) ⇒ void
Merges metadata from data.
-
#mime_type ⇒ String?
-
#mime_type=(type) ⇒ Object
-
#need_screenshot? ⇒ Bool
true
when screenshot is needed if available. -
#open {|StringIO.new(body)| ... } ⇒ Object
-
#peek_body(size) ⇒ Object
-
#release ⇒ Object
-
#text? ⇒ Bool
True if MIME type is “text/XXX”, false otherwise.
-
#text_plain? ⇒ Bool
True if MIME type is “text/plain”, false otherwise.
-
#to_utf8_body_data ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ Data
Returns a new instance of Data.
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
# File 'lib/chupa-text/data.rb', line 82 def initialize(options={}) @uri = nil @body = nil @size = nil @path = nil @mime_type = nil @attributes = Attributes.new @source = nil @screenshot = nil @need_screenshot = true @expected_screenshot_size = [200, 200] @max_body_size = nil @timeout = nil @limit_cpu = nil @limit_as = nil @options = options || {} source_data = @options[:source_data] if source_data merge!(source_data) @source = source_data end end |
Instance Attribute Details
#attributes ⇒ Attributes (readonly)
Returns The attributes of the data.
50 51 52 |
# File 'lib/chupa-text/data.rb', line 50 def attributes @attributes end |
#body ⇒ String?
Returns The content of the data, nil
if the data
doesn’t have any content.
32 33 34 |
# File 'lib/chupa-text/data.rb', line 32 def body @body end |
#expected_screenshot_size ⇒ Array<Integer, Integer>
Returns the expected screenshot size.
66 67 68 |
# File 'lib/chupa-text/data.rb', line 66 def expected_screenshot_size @expected_screenshot_size end |
#limit_as ⇒ Numeric, ...
Returns the max memory on extraction by external command.
80 81 82 |
# File 'lib/chupa-text/data.rb', line 80 def limit_as @limit_as end |
#limit_cpu ⇒ Numeric, ...
Returns the max CPU time on extraction by external command.
76 77 78 |
# File 'lib/chupa-text/data.rb', line 76 def limit_cpu @limit_cpu end |
#max_body_size ⇒ Integer?
Returns the max body size in bytes.
69 70 71 |
# File 'lib/chupa-text/data.rb', line 69 def max_body_size @max_body_size end |
#need_screenshot=(value) ⇒ Bool (writeonly)
Returns the specified value.
63 64 65 |
# File 'lib/chupa-text/data.rb', line 63 def need_screenshot=(value) @need_screenshot = value end |
#path ⇒ String?
Returns The path associated with the content of
the data, nil
if the data isn’t associated with any file.
The path may not be related to the original content. For
example, "/tmp/XXX.txt"
may be returned for the data of
"http://example.com/XXX.txt"
.
This value is useful to use an external command to extract text and meta-data.
47 48 49 |
# File 'lib/chupa-text/data.rb', line 47 def path @path end |
#screenshot ⇒ Screenshot?
Returns The screenshot of the data. For example, the first page image for a PDF file.
59 60 61 |
# File 'lib/chupa-text/data.rb', line 59 def screenshot @screenshot end |
#size ⇒ Integer?
Returns The byte size of the data, nil
if the data
doesn’t have any content.
36 37 38 |
# File 'lib/chupa-text/data.rb', line 36 def size @size end |
#source ⇒ Data?
Returns The source of the data. For example, text
data (hello.txt
) in archive data (hello.tar
) has the
archive data in #source.
55 56 57 |
# File 'lib/chupa-text/data.rb', line 55 def source @source end |
#timeout ⇒ Numeric, ...
Returns the timeout on extraction.
72 73 74 |
# File 'lib/chupa-text/data.rb', line 72 def timeout @timeout end |
#uri ⇒ URI?
Returns The URI of the data if the data is for remote
or local file, nil
if the data isn’t associated with any
URIs.
28 29 30 |
# File 'lib/chupa-text/data.rb', line 28 def uri @uri end |
Instance Method Details
#[](name) ⇒ Object
167 168 169 |
# File 'lib/chupa-text/data.rb', line 167 def [](name) @attributes[name] end |
#[]=(name, value) ⇒ Object
171 172 173 |
# File 'lib/chupa-text/data.rb', line 171 def []=(name, value) @attributes[name] = value end |
#extension ⇒ String?
Returns Normalized extension as String if #uri
is not nil
, nil
otherwise. The normalized extension uses
lower case like pdf
not PDF
.
193 194 195 196 197 198 199 200 |
# File 'lib/chupa-text/data.rb', line 193 def extension return nil if @uri.nil? if @uri.is_a?(URI::HTTP) and @uri.path.end_with?("/") "html" else File.extname(@uri.path).downcase.gsub(/\A\./, "") end end |
#initialize_copy(object) ⇒ Object
105 106 107 108 109 |
# File 'lib/chupa-text/data.rb', line 105 def initialize_copy(object) super @attributes = @attributes.dup self end |
#merge!(data) ⇒ void
This method returns an undefined value.
Merges metadata from data.
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# File 'lib/chupa-text/data.rb', line 116 def merge!(data) self.uri = data.uri self.path = data.path data.attributes.each do |name, value| self[name] = value end if data.mime_type self["source-mime-types"] ||= [] self["source-mime-types"].unshift(data.mime_type) end self.need_screenshot = data.need_screenshot? self.expected_screenshot_size = data.expected_screenshot_size self.max_body_size = data.max_body_size self.timeout = data.timeout self.limit_cpu = data.limit_cpu self.limit_as = data.limit_as end |
#mime_type ⇒ String?
179 180 181 |
# File 'lib/chupa-text/data.rb', line 179 def mime_type @mime_type || guess_mime_type end |
#mime_type=(type) ⇒ Object
186 187 188 |
# File 'lib/chupa-text/data.rb', line 186 def mime_type=(type) @mime_type = type end |
#need_screenshot? ⇒ Bool
Returns true
when screenshot is needed if available.
215 216 217 |
# File 'lib/chupa-text/data.rb', line 215 def need_screenshot? @need_screenshot end |
#open {|StringIO.new(body)| ... } ⇒ Object
154 155 156 |
# File 'lib/chupa-text/data.rb', line 154 def open yield(StringIO.new(body)) end |
#peek_body(size) ⇒ Object
161 162 163 164 165 |
# File 'lib/chupa-text/data.rb', line 161 def peek_body(size) _body = body return nil if _body.nil? _body[0, size] end |
#release ⇒ Object
158 159 |
# File 'lib/chupa-text/data.rb', line 158 def release end |
#text? ⇒ Bool
Returns true if MIME type is “text/XXX”, false otherwise.
204 205 206 |
# File 'lib/chupa-text/data.rb', line 204 def text? (mime_type || "").start_with?("text/") end |
#text_plain? ⇒ Bool
Returns true if MIME type is “text/plain”, false otherwise.
210 211 212 |
# File 'lib/chupa-text/data.rb', line 210 def text_plain? mime_type == "text/plain" end |
#to_utf8_body_data ⇒ Object
219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 |
# File 'lib/chupa-text/data.rb', line 219 def to_utf8_body_data b = nil if @max_body_size open do |input| b = input.read(@max_body_size) end else b = body end return self if b.nil? converter = UTF8Converter.new(b) utf8_body = converter.convert if @max_body_size.nil? and b.equal?(utf8_body) self else TextData.new(utf8_body, source_data: self) end end |