Module: AcroForge::Relabeler

Defined in:
lib/acroforge/relabeler.rb

Constant Summary collapse

KEY_REGEX =
/\A[a-z][a-z0-9_]*\z/

Class Method Summary collapse

Class Method Details

.apply!(pdf_path, mapping_path) ⇒ Object

Raises:



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/acroforge/relabeler.rb', line 19

# Rename the AcroForm fields of a PDF in place according to a YAML mapping.
#
# Top-level mapping entries whose name starts with "_" are ignored. Every
# other entry names a PDF field (looked up through
# AcroForge::Engine.field_index) and carries the "key" to rename it to.
# Entries with a nil/empty key are counted as skipped; entries whose field is
# not in the index are reported on stderr and counted as stale. For each
# renamed field both :T and :TU are set to the new name, then the document is
# written back to +pdf_path+.
#
# When several entries request the same key, the first one keeps it and the
# later ones receive "_1", "_2", ... suffixes. A generated suffix never takes
# a name that another resolvable entry requests explicitly, so an explicit
# "date_1" is not displaced by a disambiguated "date".
#
# @param pdf_path [String] PDF to read and overwrite
# @param mapping_path [String] YAML mapping file
# @return [Hash] counts under :total, :renamed, :disambiguated,
#   :skipped_null and :stale
# @raise [RelabelError] if the mapping file is not a YAML mapping, if
#   validate! rejects an entry, or if the PDF has no AcroForm
def apply!(pdf_path, mapping_path)
  data = YAML.load_file(mapping_path) || {}
  # A list or scalar at the top level would otherwise fail further down with
  # an unrelated NoMethodError/TypeError.
  unless data.is_a?(Hash)
    raise RelabelError, "mapping file is not a YAML mapping: #{mapping_path}"
  end
  entries = data.reject { |name, _| name.to_s.start_with?("_") }

  validate!(entries)

  doc = HexaPDF::Document.open(pdf_path)
  form = doc.acro_form(create: false)
  raise RelabelError, "PDF has no AcroForm: #{pdf_path}" unless form

  skipped_null = 0
  stale = 0

  # Build a synthetic-name -> field index using the same naming scheme
  # the engine emits during compile!. This handles PDFs where multiple
  # fields share the same :T name: the mapping refers to "date",
  # "date#1", "date#2", and each one resolves to the right field.
  field_index = AcroForge::Engine.field_index(form)

  # Pass 1: resolve every entry to a [field, requested key] pair, so that the
  # full set of explicitly requested names is known before any suffix is
  # generated.
  pending = []
  entries.each do |pdf_name, entry|
    key = entry["key"].to_s
    if key.empty?
      skipped_null += 1
      next
    end

    field = field_index[pdf_name]
    unless field
      stale += 1
      warn "acroforge: stale entry #{pdf_name.inspect} not found in PDF (skipping)"
      next
    end

    pending << [field, key]
  end

  requested = pending.each_with_object({}) { |(_, key), seen| seen[key] = true }

  # Pass 2: assign final names in mapping order.
  claimed = {}
  disambiguated = 0
  pending.each do |field, key|
    target = key
    if claimed.key?(target)
      counter = 1
      # Skip suffixed names already handed out or explicitly requested.
      counter += 1 while claimed.key?("#{key}_#{counter}") || requested.key?("#{key}_#{counter}")
      target = "#{key}_#{counter}"
      disambiguated += 1
    end
    claimed[target] = true

    field[:T] = target
    field[:TU] = target
  end

  doc.write(pdf_path)

  {
    total: entries.size,
    renamed: pending.size,
    disambiguated: disambiguated,
    skipped_null: skipped_null,
    stale: stale
  }
end

.build_entry(proposal, prior) ⇒ Object



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/acroforge/relabeler.rb', line 126

# Build the mapping entry for one field proposal.
#
# "key" and "type" come from +prior+ whenever +prior+ has that key (even if
# the stored value is nil); otherwise they come from the proposal's
# :canonical_key and from infer_type. The "meta" section is always rebuilt
# from the proposal.
#
# @param proposal [Hash] field proposal with symbol keys
# @param prior [Hash, nil] previously stored entry for the same field
# @return [Hash] entry with "key", "type" and "meta"
def build_entry(proposal, prior)
  suggested = {
    "key" => proposal[:canonical_key]&.to_s,
    "type" => infer_type(proposal).to_s
  }

  # A value already present in the prior entry wins over the suggestion.
  entry = %w[key type].each_with_object({}) do |attribute, resolved|
    resolved[attribute] = prior&.key?(attribute) ? prior[attribute] : suggested[attribute]
  end

  details = {
    "raw_label" => AcroForge::Schema.humanize_label(proposal[:raw_label]),
    "confidence" => proposal[:confidence].to_s,
    "section" => proposal[:section]&.to_s,
    "page" => proposal[:page]
  }
  choices = proposal[:options]
  details["options"] = choices.transform_keys(&:to_s) unless choices.nil?

  entry["meta"] = details
  entry
end

.infer_type(proposal) ⇒ Object



149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/acroforge/relabeler.rb', line 149

# Guess a schema type for a field proposal.
#
# Button fields are :select when they carry more than one option and :boolean
# otherwise; choice fields are always :select. Any other field type is
# classified by the first label pattern that matches the lower-cased
# :raw_label, falling back to :string.
#
# @param proposal [Hash] proposal with :pdf_field_type, :options, :raw_label
# @return [Symbol] :select, :boolean, :money, :date, :email, :number or :string
def infer_type(proposal)
  case proposal[:pdf_field_type]
  when :choice then return :select
  when :button
    option_count = proposal[:options]&.size || 0
    return option_count > 1 ? :select : :boolean
  end

  text = proposal[:raw_label].to_s.downcase
  # Order matters: the first matching pattern decides the type.
  heuristics = [
    [/amount|salary|income|balance|fee|tier3/, :money],
    [/\bdate\b|birth|expiry|employed/, :date],
    [/email/, :email],
    [/years|tenor|number of|\bno\.?\b/, :number]
  ]
  _, inferred = heuristics.find { |pattern, _type| pattern.match?(text) }
  inferred || :string
end

.propose(pdf_path, out:, schema: {}, mode: :merge, engine: nil) ⇒ Object

Write a mapping YAML proposing semantic names for every AcroForm field.

If `engine:` is given, the caller has already compiled an engine and we use its proposals directly (no second compile). This lets callers like the CLI's `bootstrap` subcommand share one compile pass with Schema.infer instead of running the engine twice.



97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/acroforge/relabeler.rb', line 97

# Write a mapping YAML proposing semantic names for every AcroForm field.
#
# Proposals come from +engine+ when one is given; otherwise a temporary
# engine is compiled for +pdf_path+. Entries are ordered by page, then by
# descending :y, then by :x. In :merge mode, a field's existing entry in
# +out+ is passed to build_entry as the prior entry.
#
# An existing +out+ file that is empty or does not contain a YAML mapping is
# treated as having no prior entries (with a warning when non-empty content
# is being ignored); the same applies to a single prior entry that is not a
# mapping. The file is overwritten in every case.
#
# @param pdf_path [String] source PDF
# @param out [String] path of the mapping file to write
# @param schema [Hash] passed to AcroForge::Engine.new when compiling
# @param mode [Symbol] :merge reuses entries already present in +out+
# @param engine [#field_proposals, nil] already-compiled engine to reuse
# @return [Hash] :total, :mapped, :unmapped and :out_path
def propose(pdf_path, out:, schema: {}, mode: :merge, engine: nil)
  loaded = (mode == :merge && File.exist?(out)) ? YAML.load_file(out) : nil
  # An empty file loads as nil/false and a hand-edited file may hold a list
  # or scalar; `&.` only guards nil, so normalise everything else here.
  existing = loaded.is_a?(Hash) ? loaded : nil
  if loaded && existing.nil?
    warn "acroforge: existing mapping #{out.inspect} is not a YAML mapping (ignoring its contents)"
  end

  proposals = if engine
    engine.field_proposals
  else
    Dir.mktmpdir do |tmp|
      e = AcroForge::Engine.new(pdf_path, schema: schema, normalized_dir: tmp)
      e.compile!
      e.field_proposals
    end
  end

  sorted = proposals.sort_by { |proposal| [proposal[:page], -proposal[:y], proposal[:x]] }
  entries = sorted.each_with_object({}) do |proposal, acc|
    name = proposal[:pdf_field_name]
    prior = existing && existing[name]
    # A malformed prior entry (e.g. a bare string) cannot be merged.
    prior = nil unless prior.is_a?(Hash)
    acc[name] = build_entry(proposal, prior)
  end

  File.write(out, render_yaml(pdf_path, entries))

  mapped = entries.values.count { |entry| !entry["key"].nil? && !entry["key"].to_s.empty? }
  {
    total: entries.size,
    mapped: mapped,
    unmapped: entries.size - mapped,
    out_path: out
  }
end

.render_yaml(pdf_path, entries) ⇒ Object



167
168
169
170
171
172
173
174
175
176
177
# File 'lib/acroforge/relabeler.rb', line 167

# Serialise the mapping entries to YAML, preceded by a "_meta" section.
#
# The "_meta" section records the source PDF path, the current UTC time in
# ISO 8601 form, AcroForge::VERSION and the number of entries. Entries follow
# in their existing order.
#
# @param pdf_path [String] path recorded as "source_pdf"
# @param entries [Hash] field name => entry
# @return [String] YAML document
def render_yaml(pdf_path, entries)
  header = {
    "source_pdf" => pdf_path,
    "generated_at" => Time.now.utc.iso8601,
    "acroforge_version" => AcroForge::VERSION,
    "total_fields" => entries.size
  }
  document = { "_meta" => header }
  document.update(entries)
  YAML.dump(document)
end

.validate!(entries) ⇒ Object



80
81
82
83
84
85
86
87
88
89
# File 'lib/acroforge/relabeler.rb', line 80

# Check mapping entries before any PDF is touched.
#
# Field names starting with "_" are rejected as reserved. Every entry must be
# a mapping; its "key" may be nil or empty (meaning "leave unmapped") and
# must otherwise match KEY_REGEX.
#
# @param entries [Hash] field name => entry
# @return [Hash] the entries, unchanged
# @raise [RelabelError] on a reserved name, a non-mapping entry, or a key
#   that does not match KEY_REGEX
def validate!(entries)
  entries.each do |pdf_name, entry|
    raise RelabelError, "reserved sentinel: #{pdf_name.inspect}" if pdf_name.to_s.start_with?("_")

    # A hand-edited `field:` with no body loads as nil; report it as a
    # mapping error rather than a NoMethodError from entry["key"].
    unless entry.is_a?(Hash)
      raise RelabelError, "malformed entry for field #{pdf_name.inspect}: expected a mapping, got #{entry.inspect}"
    end

    key = entry["key"]
    next if key.nil? || key.to_s.empty?
    unless key.to_s.match?(KEY_REGEX)
      raise RelabelError, "invalid key #{key.inspect} for field #{pdf_name.inspect}: must match #{KEY_REGEX.inspect}"
    end
  end
end