Class: Rpdfium::Util::LabelMatcher
- Inherits:
-
Object
- Object
- Rpdfium::Util::LabelMatcher
- Defined in:
- lib/rpdfium/util/label_matcher.rb
Overview
Associates semantic labels with values placed on PDFs of filled-in forms (F24, VAT communications, Modello 770) where template and data coexist as graphic text in different fonts.
Base strategy:
-
Cluster the template words into “coherent labels”: words that are geometrically close form a single label.
-
**For each value** search for:
-
`:col` — the label ABOVE in the same column
-
`:row` — the label TO THE LEFT in the same row
-
-
(Optional) **Column reassignment**: uses `ColumnInference` to identify repetitive columns (e.g. ST2..ST13 of the 770 Quadro ST) and propagates the canonical header to all the values in the column, overriding the `col_max_dy` limit.
Constant Summary collapse
- DEFAULT_COL_MAX_DY = 80.0
- DEFAULT_ROW_MAX_DX = 200.0
- DEFAULT_COL_X_TOLERANCE = 10.0
- DEFAULT_ROW_Y_TOLERANCE = 2.0
- DEFAULT_CLUSTER_SAME_ROW_DY = 4.0
- DEFAULT_CLUSTER_SAME_ROW_DX = 12.0
- DEFAULT_CLUSTER_ADJ_ROW_DY = 4.0
- DEFAULT_IGNORE_LABEL_PATTERN = /\A\d{1,3}\z|\A[IVX]{1,5}\z/.freeze
- WIDE_VALUE_THRESHOLD = 60.0
Instance Method Summary collapse
-
#cluster_anchors(anchor_words) ⇒ Object
Reconstructs the labels from the cluster of template words.
-
#initialize(col_max_dy: DEFAULT_COL_MAX_DY, row_max_dx: DEFAULT_ROW_MAX_DX, col_x_tolerance: DEFAULT_COL_X_TOLERANCE, row_y_tolerance: DEFAULT_ROW_Y_TOLERANCE, cluster_same_row_dy: DEFAULT_CLUSTER_SAME_ROW_DY, cluster_same_row_dx: DEFAULT_CLUSTER_SAME_ROW_DX, cluster_adj_row_dy: DEFAULT_CLUSTER_ADJ_ROW_DY, ignore_label_pattern: DEFAULT_IGNORE_LABEL_PATTERN, column_inference: nil) ⇒ LabelMatcher
constructor
A new instance of LabelMatcher.
-
#match(values, anchors) ⇒ Array<Hash>
Computes the label → value associations.
Constructor Details
#initialize(col_max_dy: DEFAULT_COL_MAX_DY, row_max_dx: DEFAULT_ROW_MAX_DX, col_x_tolerance: DEFAULT_COL_X_TOLERANCE, row_y_tolerance: DEFAULT_ROW_Y_TOLERANCE, cluster_same_row_dy: DEFAULT_CLUSTER_SAME_ROW_DY, cluster_same_row_dx: DEFAULT_CLUSTER_SAME_ROW_DX, cluster_adj_row_dy: DEFAULT_CLUSTER_ADJ_ROW_DY, ignore_label_pattern: DEFAULT_IGNORE_LABEL_PATTERN, column_inference: nil) ⇒ LabelMatcher
Returns a new instance of LabelMatcher.
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/rpdfium/util/label_matcher.rb', line 43

# Builds a matcher. Every argument is stored, unchanged, in the instance
# variable of the same name; nothing else happens at construction time.
#
# @param col_max_dy [Numeric] presumably the vertical limit for a :col
#   match; consumed by helpers outside this excerpt (confirm there)
# @param row_max_dx [Numeric] presumably the horizontal limit for a :row
#   match; consumed by helpers outside this excerpt (confirm there)
# @param col_x_tolerance [Numeric] presumably the x tolerance for "same
#   column"; consumed by helpers outside this excerpt (confirm there)
# @param row_y_tolerance [Numeric] presumably the y tolerance for "same
#   row"; consumed by helpers outside this excerpt (confirm there)
# @param cluster_same_row_dy [Numeric] two words whose :top values differ
#   by less than this may be clustered as same-row neighbours
# @param cluster_same_row_dx [Numeric] upper bound on the horizontal gap
#   between same-row neighbours
# @param cluster_adj_row_dy [Numeric] upper bound on the top/bottom distance
#   between words clustered across adjacent rows
# @param ignore_label_pattern [Regexp, nil] clustered labels whose text
#   matches are discarded; a falsy value disables the filter
# @param column_inference [Object, nil] when truthy, #match performs the
#   column reassignment step
def initialize(
  col_max_dy: DEFAULT_COL_MAX_DY,
  row_max_dx: DEFAULT_ROW_MAX_DX,
  col_x_tolerance: DEFAULT_COL_X_TOLERANCE,
  row_y_tolerance: DEFAULT_ROW_Y_TOLERANCE,
  cluster_same_row_dy: DEFAULT_CLUSTER_SAME_ROW_DY,
  cluster_same_row_dx: DEFAULT_CLUSTER_SAME_ROW_DX,
  cluster_adj_row_dy: DEFAULT_CLUSTER_ADJ_ROW_DY,
  ignore_label_pattern: DEFAULT_IGNORE_LABEL_PATTERN,
  column_inference: nil
)
  # Column (label above the value) lookup settings.
  @col_max_dy      = col_max_dy
  @col_x_tolerance = col_x_tolerance

  # Row (label to the left of the value) lookup settings.
  @row_max_dx      = row_max_dx
  @row_y_tolerance = row_y_tolerance

  # Word-clustering thresholds and the label filter.
  @cluster_same_row_dy  = cluster_same_row_dy
  @cluster_same_row_dx  = cluster_same_row_dx
  @cluster_adj_row_dy   = cluster_adj_row_dy
  @ignore_label_pattern = ignore_label_pattern

  # Optional collaborator enabling column reassignment.
  @column_inference = column_inference
end
Instance Method Details
#cluster_anchors(anchor_words) ⇒ Object
Reconstructs the labels from the cluster of template words. Exposed publicly for inspection/debug.
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# File 'lib/rpdfium/util/label_matcher.rb', line 97

# Groups template words into labels and returns the labels that survive the
# ignore filter. Exposed publicly for inspection/debug.
#
# Starting from a seed word, a group repeatedly absorbs every remaining word
# that is "close" to any current member, until a pass absorbs nothing. Two
# words are close when either:
# * same row: their :top values differ by less than @cluster_same_row_dy and
#   the horizontal gap between them is below @cluster_same_row_dx, or
# * adjacent rows: one word's :top lies within @cluster_adj_row_dy of the
#   other's :bottom and their x ranges overlap, allowing 3 units of slack.
#
# Each group is converted with group_to_label; when @ignore_label_pattern is
# set, labels whose :text matches it are dropped.
#
# @param anchor_words [Array<Hash>] words with :x0, :x1, :top and :bottom
#   keys; the array itself is not modified
# @return [Array] the labels produced by group_to_label, minus ignored ones
def cluster_anchors(anchor_words)
  near = lambda do |member, word|
    # Negative when the two x ranges overlap.
    gap = [word[:x0] - member[:x1], member[:x0] - word[:x1]].max
    same_row = (word[:top] - member[:top]).abs < @cluster_same_row_dy &&
               gap < @cluster_same_row_dx

    row_distance = [
      (member[:top] - word[:bottom]).abs,
      (word[:top] - member[:bottom]).abs
    ].min
    x_overlap = !(word[:x1] < member[:x0] - 3 || word[:x0] > member[:x1] + 3)

    same_row || (row_distance < @cluster_adj_row_dy && x_overlap)
  end

  remaining = anchor_words.dup
  groups = []

  until remaining.empty?
    group = [remaining.shift]

    # Keep sweeping until a full pass absorbs nothing. reject! removes exactly
    # the elements it absorbed. Array#delete would instead remove every word
    # that is == to the absorbed one, so an identical word (same text and
    # geometry) that had not been reached yet could vanish from the result.
    loop do
      absorbed = remaining.reject! do |word|
        next false unless group.any? { |member| near.call(member, word) }

        group << word
        true
      end
      break unless absorbed
    end

    groups << group
  end

  labels = groups.map { |members| group_to_label(members) }
  return labels unless @ignore_label_pattern

  labels.reject { |label| label[:text].match?(@ignore_label_pattern) }
end
#match(values, anchors) ⇒ Array<Hash>
Computes the label → value associations.
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/rpdfium/util/label_matcher.rb', line 68

# Computes the label → value associations.
#
# The anchors are first clustered into labels with #cluster_anchors. Each
# value is then paired with the results of find_col_label and find_row_label.
# When a column inference object was supplied at construction, the pairs are
# passed through reassign_by_columns before being formatted.
#
# @param values [Array<Hash>] value words; :text, :x0, :x1, :top and :bottom
#   are read from each one
# @param anchors [Array<Hash>] template words handed to #cluster_anchors
# @return [Array<Hash>] one hash per value with :value (the value's text),
#   :labels ({ col:, row: } label texts, nil when no label was found) and
#   :geometry ({ x0:, x1:, top:, bottom: } copied from the value)
def match(values, anchors)
  labels = cluster_anchors(anchors)

  candidates = values.map do |word|
    {
      value: word,
      col: find_col_label(word, labels),
      row: find_row_label(word, labels)
    }
  end

  # Optional reassignment for repetitive columns
  if @column_inference
    candidates = reassign_by_columns(candidates, labels, values)
  end

  candidates.map do |candidate|
    word = candidate[:value]
    col_label = candidate[:col]
    row_label = candidate[:row]
    geometry = %i[x0 x1 top bottom].map { |key| [key, word[key]] }.to_h

    {
      value: word[:text],
      labels: { col: col_label&.dig(:text), row: row_label&.dig(:text) },
      geometry: geometry
    }
  end
end