Class: ChupaText::Data
- Inherits:
- 
      Object
      
        - Object
- ChupaText::Data
 
- Defined in:
- lib/chupa-text/data.rb
Direct Known Subclasses
Instance Attribute Summary collapse
- 
  
    
      #attributes  ⇒ Attributes 
    
    
  
  
  
  
    
      readonly
    
    
  
  
  
  
  
  
    The attributes of the data. 
- 
  
    
      #body  ⇒ String? 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    The content of the data, nil if the data doesn’t have any content.
- 
  
    
      #expected_screenshot_size  ⇒ Array<Integer, Integer> 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    The expected screenshot size. 
- 
  
    
      #limit_as  ⇒ Numeric, ... 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    The max memory on extraction by external command. 
- 
  
    
      #limit_cpu  ⇒ Numeric, ... 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    The max CPU time on extraction by external command. 
- 
  
    
      #max_body_size  ⇒ Integer? 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    The max body size in bytes. 
- 
  
    
      #need_screenshot  ⇒ Bool 
    
    
  
  
  
  
    
    
      writeonly
    
  
  
  
  
  
  
    The specified value. 
- 
  
    
      #path  ⇒ String? 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    The path associated with the content of the data, nil if the data isn’t associated with any file.
- 
  
    
      #screenshot  ⇒ Screenshot? 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    The screenshot of the data. 
- 
  
    
      #size  ⇒ Integer? 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    The byte size of the data, nil if the data doesn’t have any content.
- 
  
    
      #source  ⇒ Data? 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    The source of the data. 
- 
  
    
      #timeout  ⇒ Numeric, ... 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    The timeout on extraction. 
- 
  
    
      #uri  ⇒ URI? 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    The URI of the data if the data is for remote or local file, nil if the data isn’t associated with any URIs.
Instance Method Summary collapse
- 
  
    
      #[](name)  ⇒ Object 
    
    
  
  
  
  
  
  
  
  
  
    
- 
  
    
      #[]=(name, value)  ⇒ Object 
    
    
  
  
  
  
  
  
  
  
  
    
- 
  
    
      #extension  ⇒ String? 
    
    
  
  
  
  
  
  
  
  
  
    Normalized extension as String if #uri is not nil, nil otherwise.
- 
  
    
      #initialize(options = {})  ⇒ Data 
    
    
  
  
  
    constructor
  
  
  
  
  
  
  
    A new instance of Data. 
- 
  
    
      #initialize_copy(object)  ⇒ Object 
    
    
  
  
  
  
  
  
  
  
  
    
- 
  
    
      #merge!(data)  ⇒ void 
    
    
  
  
  
  
  
  
  
  
  
    Merges metadata from data. 
- 
  
    
      #mime_type  ⇒ String? 
    
    
  
  
  
  
  
  
  
  
  
    
- 
  
    
      #mime_type=(type)  ⇒ Object 
    
    
  
  
  
  
  
  
  
  
  
    
- 
  
    
      #need_screenshot?  ⇒ Bool 
    
    
  
  
  
  
  
  
  
  
  
    true when screenshot is needed if available.
- 
  
    
      #open {|StringIO.new(body)| ... } ⇒ Object 
    
    
  
  
  
  
  
  
  
  
  
    
- 
  
    
      #peek_body(size)  ⇒ Object 
    
    
  
  
  
  
  
  
  
  
  
    
- 
  
    
      #release  ⇒ Object 
    
    
  
  
  
  
  
  
  
  
  
    
- 
  
    
      #text?  ⇒ Bool 
    
    
  
  
  
  
  
  
  
  
  
    True if MIME type is “text/XXX”, false otherwise. 
- 
  
    
      #text_plain?  ⇒ Bool 
    
    
  
  
  
  
  
  
  
  
  
    True if MIME type is “text/plain”, false otherwise. 
- 
  
    
      #to_utf8_body_data  ⇒ Object 
    
    
  
  
  
  
  
  
  
  
  
    
Constructor Details
#initialize(options = {}) ⇒ Data
Returns a new instance of Data.
| 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 | # File 'lib/chupa-text/data.rb', line 82 def initialize(options={}) @uri = nil @body = nil @size = nil @path = nil @mime_type = nil @attributes = Attributes.new @source = nil @screenshot = nil @need_screenshot = true @expected_screenshot_size = [200, 200] @max_body_size = nil @timeout = nil @limit_cpu = nil @limit_as = nil @options = options || {} source_data = @options[:source_data] if source_data merge!(source_data) @source = source_data end end | 
Instance Attribute Details
#attributes ⇒ Attributes (readonly)
Returns The attributes of the data.
| 50 51 52 | # File 'lib/chupa-text/data.rb', line 50 def attributes @attributes end | 
#body ⇒ String?
Returns The content of the data, nil if the data
doesn’t have any content.
| 32 33 34 | # File 'lib/chupa-text/data.rb', line 32 def body @body end | 
#expected_screenshot_size ⇒ Array<Integer, Integer>
Returns the expected screenshot size.
| 66 67 68 | # File 'lib/chupa-text/data.rb', line 66 def expected_screenshot_size @expected_screenshot_size end | 
#limit_as ⇒ Numeric, ...
Returns the max memory on extraction by external command.
| 80 81 82 | # File 'lib/chupa-text/data.rb', line 80 def limit_as @limit_as end | 
#limit_cpu ⇒ Numeric, ...
Returns the max CPU time on extraction by external command.
| 76 77 78 | # File 'lib/chupa-text/data.rb', line 76 def limit_cpu @limit_cpu end | 
#max_body_size ⇒ Integer?
Returns the max body size in bytes.
| 69 70 71 | # File 'lib/chupa-text/data.rb', line 69 def max_body_size @max_body_size end | 
#need_screenshot=(value) ⇒ Bool (writeonly)
Returns the specified value.
| 63 64 65 | # File 'lib/chupa-text/data.rb', line 63 def need_screenshot=(value) @need_screenshot = value end | 
#path ⇒ String?
Returns The path associated with the content of
the data, nil if the data isn’t associated with any file.
The path may not be related to the original content. For
example, "/tmp/XXX.txt" may be returned for the data of
"http://example.com/XXX.txt".
This value is useful to use an external command to extract text and meta-data.
| 47 48 49 | # File 'lib/chupa-text/data.rb', line 47 def path @path end | 
#screenshot ⇒ Screenshot?
Returns The screenshot of the data. For example, the first page image for PDF file.
| 59 60 61 | # File 'lib/chupa-text/data.rb', line 59 def screenshot @screenshot end | 
#size ⇒ Integer?
Returns The byte size of the data, nil if the data
doesn’t have any content.
| 36 37 38 | # File 'lib/chupa-text/data.rb', line 36 def size @size end | 
#source ⇒ Data?
Returns The source of the data. For example, text
data (hello.txt) in archive data (hello.tar) has the
archive data in #source.
| 55 56 57 | # File 'lib/chupa-text/data.rb', line 55 def source @source end | 
#timeout ⇒ Numeric, ...
Returns the timeout on extraction.
| 72 73 74 | # File 'lib/chupa-text/data.rb', line 72 def timeout @timeout end | 
#uri ⇒ URI?
Returns The URI of the data if the data is for remote
or local file, nil if the data isn’t associated with any
URIs.
| 28 29 30 | # File 'lib/chupa-text/data.rb', line 28 def uri @uri end | 
Instance Method Details
#[](name) ⇒ Object
| 167 168 169 | # File 'lib/chupa-text/data.rb', line 167 def [](name) @attributes[name] end | 
#[]=(name, value) ⇒ Object
| 171 172 173 | # File 'lib/chupa-text/data.rb', line 171 def []=(name, value) @attributes[name] = value end | 
#extension ⇒ String?
Returns Normalized extension as String if #uri
is not nil, nil otherwise. The normalized extension uses
lower case like pdf not PDF.
| 193 194 195 196 197 198 199 200 | # File 'lib/chupa-text/data.rb', line 193 def extension return nil if @uri.nil? if @uri.is_a?(URI::HTTP) and @uri.path.end_with?("/") "html" else File.extname(@uri.path).downcase.gsub(/\A\./, "") end end | 
#initialize_copy(object) ⇒ Object
| 105 106 107 108 109 | # File 'lib/chupa-text/data.rb', line 105 def initialize_copy(object) super @attributes = @attributes.dup self end | 
#merge!(data) ⇒ void
This method returns an undefined value.
Merges metadata from data.
| 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | # File 'lib/chupa-text/data.rb', line 116 def merge!(data) self.uri = data.uri self.path = data.path data.attributes.each do |name, value| self[name] = value end if data.mime_type self["source-mime-types"] ||= [] self["source-mime-types"].unshift(data.mime_type) end self.need_screenshot = data.need_screenshot? self.expected_screenshot_size = data.expected_screenshot_size self.max_body_size = data.max_body_size self.timeout = data.timeout self.limit_cpu = data.limit_cpu self.limit_as = data.limit_as end | 
#mime_type ⇒ String?
| 179 180 181 | # File 'lib/chupa-text/data.rb', line 179 def mime_type @mime_type || guess_mime_type end | 
#mime_type=(type) ⇒ Object
| 186 187 188 | # File 'lib/chupa-text/data.rb', line 186 def mime_type=(type) @mime_type = type end | 
#need_screenshot? ⇒ Bool
Returns true when screenshot is needed if available.
| 215 216 217 | # File 'lib/chupa-text/data.rb', line 215 def need_screenshot? @need_screenshot end | 
#open {|StringIO.new(body)| ... } ⇒ Object
| 154 155 156 | # File 'lib/chupa-text/data.rb', line 154 def open yield(StringIO.new(body)) end | 
#peek_body(size) ⇒ Object
| 161 162 163 164 165 | # File 'lib/chupa-text/data.rb', line 161 def peek_body(size) _body = body return nil if _body.nil? _body[0, size] end | 
#release ⇒ Object
| 158 159 | # File 'lib/chupa-text/data.rb', line 158 def release end | 
#text? ⇒ Bool
Returns true if MIME type is “text/XXX”, false otherwise.
| 204 205 206 | # File 'lib/chupa-text/data.rb', line 204 def text? (mime_type || "").start_with?("text/") end | 
#text_plain? ⇒ Bool
Returns true if MIME type is “text/plain”, false otherwise.
| 210 211 212 | # File 'lib/chupa-text/data.rb', line 210 def text_plain? mime_type == "text/plain" end | 
#to_utf8_body_data ⇒ Object
| 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 | # File 'lib/chupa-text/data.rb', line 219 def to_utf8_body_data b = nil if @max_body_size open do |input| b = input.read(@max_body_size) end else b = body end return self if b.nil? converter = UTF8Converter.new(b) utf8_body = converter.convert if @max_body_size.nil? and b.equal?(utf8_body) self else TextData.new(utf8_body, source_data: self) end end |