Class: ChupaText::Data

Inherits:
Object
  • Object
show all
Defined in:
lib/chupa-text/data.rb

Direct Known Subclasses

InputData, TextData, VirtualFileData

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Data

Returns a new instance of Data.



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/chupa-text/data.rb', line 82

# Creates a data object with every field unset, then, when
# +options[:source_data]+ is given, copies its metadata through #merge!
# and records it as #source.
#
# @param options [Hash, nil] construction options; nil is treated as an
#   empty Hash. The only key read here is +:source_data+.
def initialize(options={})
  # Content and location of the data.
  @uri = nil
  @body = nil
  @size = nil
  @path = nil
  @mime_type = nil
  @attributes = Attributes.new
  @source = nil

  # Screenshot settings.
  @screenshot = nil
  @need_screenshot = true
  @expected_screenshot_size = [200, 200]

  # Extraction limits; nil means "not set".
  @max_body_size = nil
  @timeout = nil
  @limit_cpu = nil
  @limit_as = nil

  @options = options || {}

  parent = @options[:source_data]
  return unless parent

  # merge! runs after the defaults above so the values taken from the
  # source data replace them.
  merge!(parent)
  @source = parent
end

Instance Attribute Details

#attributes ⇒ Attributes (readonly)

Returns The attributes of the data.

Returns:



50
51
52
# File 'lib/chupa-text/data.rb', line 50

# @return [Attributes] the attributes of the data.
def attributes
  @attributes
end

#body ⇒ String?

Returns The content of the data, nil if the data doesn’t have any content.

Returns:

  • (String, nil)

    The content of the data, nil if the data doesn’t have any content.



32
33
34
# File 'lib/chupa-text/data.rb', line 32

# @return [String, nil] the content of the data, nil if the data doesn't
#   have any content.
def body
  @body
end

#expected_screenshot_size ⇒ Array<Integer, Integer>

Returns the expected screenshot size.

Returns:

  • (Array<Integer, Integer>)

    the expected screenshot size.



66
67
68
# File 'lib/chupa-text/data.rb', line 66

# @return [Array<Integer, Integer>] the expected screenshot size.
def expected_screenshot_size
  @expected_screenshot_size
end

#limit_as ⇒ Numeric, ...

Returns the max memory on extraction by external command.

Returns:

  • (Numeric, String, nil)

    the max memory on extraction by external command.



80
81
82
# File 'lib/chupa-text/data.rb', line 80

# @return [Numeric, String, nil] the max memory on extraction by external
#   command.
def limit_as
  @limit_as
end

#limit_cpu ⇒ Numeric, ...

Returns the max CPU time on extraction by external command.

Returns:

  • (Numeric, String, nil)

    the max CPU time on extraction by external command.



76
77
78
# File 'lib/chupa-text/data.rb', line 76

# @return [Numeric, String, nil] the max CPU time on extraction by external
#   command.
def limit_cpu
  @limit_cpu
end

#max_body_size ⇒ Integer?

Returns the max body size in bytes.

Returns:

  • (Integer, nil)

    the max body size in bytes.



69
70
71
# File 'lib/chupa-text/data.rb', line 69

# @return [Integer, nil] the max body size in bytes.
def max_body_size
  @max_body_size
end

#need_screenshot=(value) ⇒ Bool (writeonly)

Returns the specified value.

Parameters:

  • value (Bool)

    true when screenshot is needed.

Returns:

  • (Bool)

    the specified value



63
64
65
# File 'lib/chupa-text/data.rb', line 63

# Sets whether a screenshot is needed.
#
# @param value [Bool] true when screenshot is needed.
# @return [Bool] the specified value.
def need_screenshot=(value)
  @need_screenshot = value
end

#path ⇒ String?

Returns The path associated with the content of the data, nil if the data isn’t associated with any file.

The path may not be related to the original content. For example, "/tmp/XXX.txt" may be returned for the data of "http://example.com/XXX.txt".

This value is useful to use an external command to extract text and meta-data.

Returns:

  • (String, nil)

    The path associated with the content of the data, nil if the data isn’t associated with any file.

    The path may not be related to the original content. For example, "/tmp/XXX.txt" may be returned for the data of "http://example.com/XXX.txt".

    This value is useful to use an external command to extract text and meta-data.



47
48
49
# File 'lib/chupa-text/data.rb', line 47

# @return [String, nil] the path associated with the content of the data,
#   nil if the data isn't associated with any file. The path may be a local
#   stand-in (for example "/tmp/XXX.txt" for "http://example.com/XXX.txt"),
#   which makes it usable with an external extraction command.
def path
  @path
end

#screenshot ⇒ Screenshot?

Returns The screenshot of the data. For example, the first page image for a PDF file.

Returns:

  • (Screenshot, nil)

    The screenshot of the data. For example, the first page image for a PDF file.



59
60
61
# File 'lib/chupa-text/data.rb', line 59

# @return [Screenshot, nil] the screenshot of the data, for example the
#   first page image of a PDF file.
def screenshot
  @screenshot
end

#size ⇒ Integer?

Returns The byte size of the data, nil if the data doesn’t have any content.

Returns:

  • (Integer, nil)

    The byte size of the data, nil if the data doesn’t have any content.



36
37
38
# File 'lib/chupa-text/data.rb', line 36

# @return [Integer, nil] the byte size of the data, nil if the data doesn't
#   have any content.
def size
  @size
end

#source ⇒ Data?

Returns The source of the data. For example, text data (hello.txt) in archive data (hello.tar) has the archive data in #source.

Returns:

  • (Data, nil)

    The source of the data. For example, text data (hello.txt) in archive data (hello.tar) has the archive data in #source.



55
56
57
# File 'lib/chupa-text/data.rb', line 55

# @return [Data, nil] the source of the data. For example, text data
#   (hello.txt) in archive data (hello.tar) has the archive data here.
def source
  @source
end

#timeout ⇒ Numeric, ...

Returns the timeout on extraction.

Returns:

  • (Numeric, String, nil)

    the timeout on extraction.



72
73
74
# File 'lib/chupa-text/data.rb', line 72

# @return [Numeric, String, nil] the timeout on extraction.
def timeout
  @timeout
end

#uri ⇒ URI?

Returns The URI of the data if the data is for remote or local file, nil if the data isn’t associated with any URIs.

Returns:

  • (URI, nil)

    The URI of the data if the data is for remote or local file, nil if the data isn’t associated with any URIs.



28
29
30
# File 'lib/chupa-text/data.rb', line 28

# @return [URI, nil] the URI of the data if the data is for remote or local
#   file, nil if the data isn't associated with any URIs.
def uri
  @uri
end

Instance Method Details

#[](name) ⇒ Object



167
168
169
# File 'lib/chupa-text/data.rb', line 167

# Reads one attribute; shorthand for +attributes[name]+.
#
# @param name the attribute name, passed unchanged to the attributes object.
# @return [Object] whatever the attributes object returns for +name+.
def [](name)
  @attributes[name]
end

#[]=(name, value) ⇒ Object



171
172
173
# File 'lib/chupa-text/data.rb', line 171

# Writes one attribute; shorthand for +attributes[name] = value+.
#
# @param name the attribute name, passed unchanged to the attributes object.
# @param value the value to store under +name+.
def []=(name, value)
  @attributes[name] = value
end

#extension ⇒ String?

Returns Normalized extension as String if #uri is not nil, nil otherwise. The normalized extension uses lower case like pdf not PDF.

Returns:

  • (String, nil)

    Normalized extension as String if #uri is not nil, nil otherwise. The normalized extension uses lower case like pdf not PDF.



193
194
195
196
197
198
199
200
# File 'lib/chupa-text/data.rb', line 193

# Returns the normalized extension derived from the path of #uri.
#
# An HTTP(S) URI whose path ends with "/" is treated as an HTML page. A URI
# that has no path component (an opaque URI such as +mailto:+ or +urn:+) is
# handled like an empty path instead of raising TypeError.
#
# @return [String, nil] the lower-case extension without the leading dot
#   (+"pdf"+, not +"PDF"+ or +".pdf"+), +""+ when the path has no
#   extension, nil when #uri is nil.
def extension
  return nil if @uri.nil?

  # The path is nil for opaque URIs and File.extname(nil) would raise.
  path = @uri.path.to_s
  if @uri.is_a?(URI::HTTP) && path.end_with?("/")
    "html"
  else
    # File.extname keeps the leading dot; drop that single dot.
    File.extname(path).downcase.sub(/\A\./, "")
  end
end

#initialize_copy(object) ⇒ Object



105
106
107
108
109
# File 'lib/chupa-text/data.rb', line 105

# Copy hook that Ruby invokes on the new object made by #dup / #clone.
#
# @param object [Data] the object being copied.
# @return [Data] self.
def initialize_copy(object)
  super
  # Give the copy its own attributes object (made with #dup) so the copy
  # and the original no longer refer to the same one.
  @attributes = @attributes.dup
  self
end

#merge!(data) ⇒ void

This method returns an undefined value.

Merges metadata from data.

Parameters:

  • data (Data)

    The data to be merged.



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/chupa-text/data.rb', line 116

# Merges metadata from +data+ into this data.
#
# Copies the URI, the path, every attribute, the screenshot settings and
# the extraction limits of +data+. When +data+ has a MIME type, that type
# is put at the front of this data's "source-mime-types" attribute.
# +data+ itself is never modified.
#
# @param data [Data] The data to be merged.
# @return [void]
def merge!(data)
  self.uri = data.uri
  self.path = data.path
  data.attributes.each do |name, value|
    self[name] = value
  end

  # Read once: mime_type may fall back to guessing when no type is set.
  source_mime_type = data.mime_type
  if source_mime_type
    # The loop above may have stored the very Array that +data+ holds, so
    # build a new Array instead of calling unshift on it. Mutating in place
    # would also change +data+'s own "source-mime-types".
    inherited = self["source-mime-types"] || []
    self["source-mime-types"] = [source_mime_type, *inherited]
  end

  self.need_screenshot = data.need_screenshot?
  self.expected_screenshot_size = data.expected_screenshot_size
  self.max_body_size = data.max_body_size
  self.timeout = data.timeout
  self.limit_cpu = data.limit_cpu
  self.limit_as = data.limit_as
end

#mime_type ⇒ String?

Returns:

  • (String)

    The MIME type of the data. If MIME type isn’t set, guesses MIME type from path and body.

  • (nil)

    If MIME type isn’t set and it can’t guess MIME type from path and body.



179
180
181
# File 'lib/chupa-text/data.rb', line 179

# @return [String] the MIME type of the data. If MIME type isn't set,
#   the result of guess_mime_type is returned instead.
# @return [nil] if MIME type isn't set and guess_mime_type returns nil.
def mime_type
  @mime_type || guess_mime_type
end

#mime_type=(type) ⇒ Object

Parameters:

  • type (String, nil)

    The MIME type of the data. You can unset MIME type by nil. If you unset MIME type, MIME type is guessed from path and body of the data.



186
187
188
# File 'lib/chupa-text/data.rb', line 186

# @param type [String, nil] the MIME type of the data. Passing nil unsets
#   it; the mime_type reader then falls back to a guessed type.
def mime_type=(type)
  @mime_type = type
end

#need_screenshot? ⇒ Bool

Returns true when screenshot is needed if available.

Returns:

  • (Bool)

    true when screenshot is needed if available.



215
216
217
# File 'lib/chupa-text/data.rb', line 215

# @return [Bool] true when screenshot is needed if available.
def need_screenshot?
  @need_screenshot
end

#open {|StringIO.new(body)| ... } ⇒ Object

Yields:

  • (StringIO.new(body))


154
155
156
# File 'lib/chupa-text/data.rb', line 154

# Wraps #body in an in-memory IO and hands it to the given block.
#
# @yieldparam input [StringIO] a StringIO created from #body.
# @return [Object] the value of the block.
def open
  input = StringIO.new(body)
  yield(input)
end

#peek_body(size) ⇒ Object



161
162
163
164
165
# File 'lib/chupa-text/data.rb', line 161

# Returns the leading part of #body without consuming anything.
#
# @param size [Integer] the maximum length to return, used as the length
#   argument of +body[0, size]+.
# @return [String, nil] the start of the body, nil if #body is nil.
def peek_body(size)
  content = body
  content[0, size] unless content.nil?
end

#release ⇒ Object



158
159
# File 'lib/chupa-text/data.rb', line 158

# Does nothing in this class.
#
# NOTE(review): presumably a hook that subclasses override to free
# resources held for the data - confirm against the subclasses.
def release
end

#text? ⇒ Bool

Returns true if MIME type is “text/XXX”, false otherwise.

Returns:

  • (Bool)

    true if MIME type is “text/XXX”, false otherwise.



204
205
206
# File 'lib/chupa-text/data.rb', line 204

# Tells whether the data is textual, based on #mime_type.
#
# @return [Bool] true if MIME type is "text/XXX", false otherwise
#   (including when no MIME type is available).
def text?
  type = mime_type
  return false unless type

  type.start_with?("text/")
end

#text_plain? ⇒ Bool

Returns true if MIME type is “text/plain”, false otherwise.

Returns:

  • (Bool)

    true if MIME type is “text/plain”, false otherwise.



210
211
212
# File 'lib/chupa-text/data.rb', line 210

# @return [Bool] true if MIME type is "text/plain", false otherwise. The
#   type is read through the mime_type reader, not the raw instance
#   variable.
def text_plain?
  mime_type == "text/plain"
end

#to_utf8_body_data ⇒ Object



219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/chupa-text/data.rb', line 219

# Returns data whose body has been passed through UTF8Converter.
#
# When a max body size is set, only that many bytes are read through #open;
# otherwise the whole #body is used.
#
# @return [Data] self when there is nothing to convert (the body read is
#   nil) or when the whole body was used and the converter handed back the
#   very same String object; otherwise a new TextData holding the converted
#   text with this data as its +source_data+.
def to_utf8_body_data
  limit = @max_body_size
  raw = nil
  if limit
    # Assign inside the block: #open is not required to return the block's
    # value.
    open { |input| raw = input.read(limit) }
  else
    raw = body
  end
  return self if raw.nil?

  converted = UTF8Converter.new(raw).convert
  # Identity (not equality) check: only reuse self when no new String was
  # produced and the body was not truncated by a size limit.
  return self if limit.nil? && raw.equal?(converted)

  TextData.new(converted, source_data: self)
end