Class: Scrapetor::Schema

Inherits:
Object
  • Object
show all
Defined in:
lib/scrapetor/schema.rb

Defined Under Namespace

Classes: Field, Group

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Schema

Returns a new instance of Schema.



13
14
15
16
# File 'lib/scrapetor/schema.rb', line 13

# Sets up an empty schema: no fields and no groups yet.
# Entries are appended afterwards by #field and #repeated.
def initialize
  @groups = []
  @fields = []
end

Instance Attribute Details

#fields ⇒ Object (readonly)

Returns the value of attribute fields.



11
12
13
# File 'lib/scrapetor/schema.rb', line 11

# Reader for the schema's top-level field list.
#
# @return [Array] the collection stored in @fields (an empty Array right
#   after #initialize)
def fields
  @fields
end

#groups ⇒ Object (readonly)

Returns the value of attribute groups.



11
12
13
# File 'lib/scrapetor/schema.rb', line 11

# Reader for the schema's top-level group list.
#
# @return [Array] the collection stored in @groups (an empty Array right
#   after #initialize)
def groups
  @groups
end

Class Method Details

.build(&block) ⇒ Object



18
19
20
21
22
# File 'lib/scrapetor/schema.rb', line 18

# Builds a schema by evaluating the given DSL block against a fresh instance.
#
# The block runs through `instance_eval`, so inside it `self` is the new
# schema and its instance methods can be called without a receiver.
# Without a block the untouched new instance is returned.
#
# @yield DSL body, evaluated in the context of the new instance
# @return [Object] the newly created instance
def self.build(&block)
  new.tap do |schema|
    schema.instance_eval(&block) if block
  end
end

.dump_to_file(schema, path) ⇒ Object



73
74
75
76
# File 'lib/scrapetor/schema.rb', line 73

# Serializes +schema+ with its #dump method and writes the resulting bytes
# to +path+ in binary mode.
#
# @param schema [#dump] object whose #dump result is written
# @param path [String] destination file
# @return [String] +path+, unchanged
def self.dump_to_file(schema, path)
  blob = schema.dump
  File.binwrite(path, blob)
  path
end

.dumpable(schema) ⇒ Object

Convert a schema to a portable Hash (no procs).



83
84
85
86
87
88
# File 'lib/scrapetor/schema.rb', line 83

# Converts a schema into a plain Hash with two keys, +:fields+ and
# +:groups+, each holding the per-entry hashes produced by field_to_h and
# group_to_h respectively.
#
# @param schema [#fields, #groups] schema to convert
# @return [Hash] `{ fields: [...], groups: [...] }`
def self.dumpable(schema)
  field_hashes = schema.fields.map { |field| field_to_h(field) }
  group_hashes = schema.groups.map { |group| group_to_h(group) }
  { fields: field_hashes, groups: group_hashes }
end

.field_from_h(h) ⇒ Object



123
124
125
126
127
128
129
# File 'lib/scrapetor/schema.rb', line 123

# Rebuilds a Field from the Hash form written by field_to_h.
#
# Field.new takes its arguments positionally; the first ten come straight
# from +h+ in the order listed below, the eleventh (the transform slot) is
# always nil because field_to_h never writes one, and the twelfth is the
# delimiter.
#
# @param h [Hash] field description keyed by Symbol
# @return [Field]
def self.field_from_h(h)
  leading_keys = %i[
    name selector attr attr_str type
    clean multi normalize_url default required
  ]
  leading_args = leading_keys.map { |key| h[key] }
  Field.new(*leading_args, nil, h[:delimiter])
end

.field_to_h(f) ⇒ Object

Raises:

  • (SchemaError) — if the field has a `transform:` block, which cannot be serialized



90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/scrapetor/schema.rb', line 90

# Converts a single field into a plain Hash of its attributes.
#
# The transform attribute is deliberately left out of the result; a field
# that carries one is rejected up front instead.
#
# @param f [Field] field to convert
# @return [Hash] attribute name (Symbol) => value, in the key order below
# @raise [SchemaError] if +f.transform+ is set
def self.field_to_h(f)
  raise SchemaError, "transform: blocks can't be serialized" if f.transform

  # Listed in the exact order the keys must appear in the resulting Hash.
  exported = %i[
    name selector attr attr_str type clean
    multi normalize_url default required delimiter
  ]
  exported.each_with_object({}) { |key, out| out[key] = f.public_send(key) }
end

.group_from_h(h) ⇒ Object



131
132
133
134
135
136
137
138
# File 'lib/scrapetor/schema.rb', line 131

# Rebuilds a Group (and, recursively, its nested groups) from a Hash.
#
# A missing or nil +:fields+ / +:groups+ entry is treated as an empty list,
# so a hand-written leaf group does not have to spell out `groups: []`.
#
# @param h [Hash] group description with +:name+, +:selector+ and
#   optionally +:fields+ and +:groups+ (Arrays of Hashes)
# @return [Group]
def self.group_from_h(h)
  fields = (h[:fields] || []).map { |fh| field_from_h(fh) }
  groups = (h[:groups] || []).map { |gh| group_from_h(gh) }
  Group.new(h[:name], h[:selector], fields, groups)
end

.group_to_h(g) ⇒ Object



107
108
109
110
111
112
113
114
# File 'lib/scrapetor/schema.rb', line 107

# Converts a group into a plain Hash, recursing into nested groups.
#
# @param g [Group] group to convert
# @return [Hash] `{ name:, selector:, fields: [...], groups: [...] }` where
#   +fields+ holds field_to_h results and +groups+ holds nested
#   group_to_h results
def self.group_to_h(g)
  field_hashes  = g.fields.map { |field| field_to_h(field) }
  nested_hashes = g.groups.map { |child| group_to_h(child) }
  { name: g.name, selector: g.selector, fields: field_hashes, groups: nested_hashes }
end

.load(blob) ⇒ Object



69
70
71
# File 'lib/scrapetor/schema.rb', line 69

# Restores a schema from a Marshal blob such as the one produced by #dump.
#
# SECURITY: Marshal.load can instantiate arbitrary objects from the blob.
# Only pass data this application wrote itself; never feed it bytes from
# an untrusted source.
#
# @param blob [String] Marshal-encoded Hash
# @return [Object] whatever new_from_h builds from the decoded Hash
def self.load(blob)
  payload = Marshal.load(blob) # rubocop:disable Security/MarshalLoad
  new_from_h(payload)
end

.load_file(path) ⇒ Object



78
79
80
# File 'lib/scrapetor/schema.rb', line 78

# Reads +path+ in binary mode and restores a schema from its contents
# via .load.
#
# @param path [String] file containing a dumped schema
# @return [Object] the result of .load
def self.load_file(path)
  blob = File.binread(path)
  load(blob)
end

.new_from_h(h) ⇒ Object



116
117
118
119
120
121
# File 'lib/scrapetor/schema.rb', line 116

# Builds a schema from its Hash form (the shape returned by .dumpable).
#
# A missing or nil +:fields+ / +:groups+ entry is treated as an empty list
# instead of failing with NoMethodError on nil, so partial hand-written
# hashes such as `{ fields: [...] }` are accepted.
#
# @param h [Hash] schema description with optional +:fields+ and +:groups+
#   Arrays of Hashes
# @return [Object] a new instance populated with the rebuilt fields and groups
def self.new_from_h(h)
  schema = new
  (h[:fields] || []).each { |fh| schema.fields << field_from_h(fh) }
  (h[:groups] || []).each { |gh| schema.groups << group_from_h(gh) }
  schema
end

Instance Method Details

#dump ⇒ Object

----- Cross-process plan cache -----

Serialize a schema to a binary blob (Marshal) so a worker can restore the compiled descriptor without re-parsing the Ruby DSL. Schemas using `transform:` (procs) can't be dumped — those plans must be rebuilt from source.



65
66
67
# File 'lib/scrapetor/schema.rb', line 65

# Serializes this schema to a Marshal blob.
#
# The schema is first reduced to a plain Hash by the class-level
# .dumpable, and that Hash is what gets marshalled.
#
# @return [String] Marshal-encoded bytes
def dump
  portable = self.class.dumpable(self)
  Marshal.dump(portable)
end

#field(name, from:, attr: nil, type: :text, clean: false, multi: false, normalize_url: false, default: nil, required: false, transform: nil, delimiter: /\s*,\s*/) ⇒ Object

Usage:

  field :name, from: SELECTOR, attr: SYM, type: SYM,
               clean: BOOL, multi: BOOL, normalize_url: BOOL,
               default: VALUE, required: BOOL,
               transform: PROC, delimiter: STRING_OR_REGEX

`from:` may be a String selector or an Array of selectors (tried in order until one matches).

Types: :text, :integer, :float, :money, :url, :date, :json, :html, :list, :boolean, :array (alias for `multi: true`)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/scrapetor/schema.rb', line 34

# Declares a field and appends it to this schema's field list.
#
# @param name [Object] field name
# @param from [Object] selector, stored as the field's selector
# @param attr [Object, nil] attribute to read; its String form is stored
#   alongside it
# @param type [Symbol] value type; +:array+ is rewritten to +:text+ with
#   +multi+ forced to true
# @param clean [Boolean]
# @param multi [Boolean]
# @param normalize_url [Boolean]
# @param default [Object, nil]
# @param required [Boolean]
# @param transform [Object, nil]
# @param delimiter [Regexp, String]
# @return [Array] the field list, after the new Field was appended
def field(name,
          from:,
          attr: nil,
          type: :text,
          clean: false,
          multi: false,
          normalize_url: false,
          default: nil,
          required: false,
          transform: nil,
          delimiter: /\s*,\s*/)
  # :array is shorthand for a multi-valued :text field.
  if type == :array
    multi = true
    type  = :text
  end

  # String form of attr, computed once here; a nil/false attr is kept as-is.
  attr_name = attr && attr.to_s

  new_field = Field.new(
    name, from, attr, attr_name, type, clean, multi,
    normalize_url, default, required, transform, delimiter
  )
  @fields << new_field
end

#repeated(selector, as:, &block) ⇒ Object



53
54
55
56
# File 'lib/scrapetor/schema.rb', line 53

# Declares a repeated group: the block is evaluated as a nested schema
# (via the class-level .build) and the resulting fields and groups are
# wrapped in a Group that is appended to this schema's group list.
#
# @param selector [Object] selector stored on the group
# @param as [Object] name stored on the group
# @yield DSL body describing the group's contents
# @return [Array] the group list, after the new Group was appended
def repeated(selector, as:, &block)
  nested = self.class.build(&block)
  group  = Group.new(as, selector, nested.fields, nested.groups)
  @groups << group
end

#to_h ⇒ Object



140
141
142
# File 'lib/scrapetor/schema.rb', line 140

# Returns this schema as a plain Hash by delegating to the class-level
# .dumpable.
#
# @return [Object] whatever .dumpable returns for this instance
def to_h
  self.class.dumpable(self)
end