Class: Rpdfium::Table::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/rpdfium/table/extractor.rb

Overview

Finds tables on a page, faithful to `pdfplumber.TableFinder`.

Pipeline:

1. collect candidate edges for each axis, according to strategy
   (`:lines` / `:lines_strict` / `:text` / `:explicit`)
2. merge_edges (snap collinear + join contiguous)
3. filter by minimum length
4. edges_to_intersections with tolerance
5. intersections_to_cells (smallest cell for each point)
6. cells_to_tables (grouping by shared corners)

Public API:

ext = Rpdfium::Table::Extractor.new(page, **opts)
ext.tables           # => [Table, ...]   (Rpdfium::Table::Table objects)
ext.extract          # => [[[String]]]   (Array of tables, each table
                                           is an Array of rows, each row
                                           is an Array of strings)
ext.find             # alias of .tables (back-compat with 0.2.x)
ext.edges            # refined edges
ext.intersections    # Hash {[x,y] => {v:[],h:[]}}
ext.cells            # Array<bbox>

Constant Summary collapse

# Default option values. Options given to the constructor are merged over
# these (`DEFAULTS.merge(opts)`) before being resolved into the settings.
DEFAULTS =
{
  # Edge-detection strategy for each axis.
  vertical_strategy:   :lines,
  horizontal_strategy: :lines,
  # Caller-supplied lines; presumably consumed by the :explicit strategy
  # (confirm against build_edges).
  explicit_vertical_lines:   [],
  explicit_horizontal_lines: [],

  # Tolerances. A nil `_x_` / `_y_` variant inherits from the un-suffixed
  # value.
  snap_tolerance:           3.0,
  snap_x_tolerance:         nil,
  snap_y_tolerance:         nil,
  join_tolerance:           3.0,
  join_x_tolerance:         nil,
  join_y_tolerance:         nil,

  # Minimum edge lengths. NOTE(review): the prefilter value presumably
  # applies before edges are merged and the other one after; confirm.
  edge_min_length:           3.0,
  edge_min_length_prefilter: 1.0,

  # Word-count thresholds; presumably used by the :text strategy (confirm).
  min_words_vertical:   Edges::DEFAULT_MIN_WORDS_VERTICAL,
  min_words_horizontal: Edges::DEFAULT_MIN_WORDS_HORIZONTAL,

  # The `_x_` / `_y_` values are what #intersections hands to
  # Edges.edges_to_intersections.
  intersection_tolerance:   3.0,
  intersection_x_tolerance: nil,
  intersection_y_tolerance: nil,

  # Text settings (passed to each table's #extract when .extract is called;
  # keyword arguments given to .extract override them).
  # The 3.0 defaults are those of pdfplumber.
  text_x_tolerance: Util::WordExtractor::DEFAULT_X_TOLERANCE,
  text_y_tolerance: Util::WordExtractor::DEFAULT_Y_TOLERANCE,
  text_keep_blank_chars: false,

  # Auto-fallback: if the configured strategies produce no edges, retry
  # with :text on both axes (skipped when both axes already use :text).
  # We keep the flag (it was already in 0.2.x) but ONLY as a fallback,
  # never as a "fix" for pathological layouts — consistent with
  # pdfplumber, which does not have it (pdfplumber users know they
  # must choose the strategy).
  auto_fallback: true
}.freeze
# Names of the known edge-detection strategies.
# NOTE(review): presumably the list validate_strategies! checks the
# :vertical_strategy / :horizontal_strategy settings against; confirm.
VALID_STRATEGIES =
%i[lines lines_strict text explicit].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(page, **opts) ⇒ Extractor

Returns a new instance of Extractor.



69
70
71
72
73
# File 'lib/rpdfium/table/extractor.rb', line 69

# Stores the page and computes the effective settings.
#
# @param page [Object] the page to search for tables (kept as-is in @page)
# @param opts [Hash] overrides for DEFAULTS; caller values win on key clashes
# NOTE(review): validate_strategies! presumably raises on an unknown
# strategy name; confirm the exception class before documenting @raise.
def initialize(page, **opts)
  @page = page

  # Caller options are layered over the defaults, then resolved as a whole.
  requested = DEFAULTS.merge(opts)
  @settings = resolve_settings(requested)

  # Validation runs last because it needs the resolved settings.
  validate_strategies!
end

Instance Attribute Details

#page ⇒ Object (readonly)

Returns the value of attribute page.



67
68
69
# File 'lib/rpdfium/table/extractor.rb', line 67

# Reader for the page this extractor was constructed with.
#
# @return [Object] the `page` argument given to #initialize, unchanged
def page
  @page
end

#settings ⇒ Object (readonly)

Returns the value of attribute settings.



67
68
69
# File 'lib/rpdfium/table/extractor.rb', line 67

# Reader for the effective settings computed in #initialize
# (DEFAULTS merged with the caller's options, then passed through
# resolve_settings).
#
# @return [Object] the resolved settings; indexed by Symbol keys such as
#   :vertical_strategy elsewhere in this class
def settings
  @settings
end

Instance Method Details

#cells ⇒ Object



98
99
100
# File 'lib/rpdfium/table/extractor.rb', line 98

# Cells derived from the intersections, computed on first use and cached
# in @cells.
#
# @return [Object] the result of Cells.intersections_to_cells
def cells
  return @cells if @cells

  @cells = Cells.intersections_to_cells(intersections)
end

#edges ⇒ Object

Full pipeline, builds the refined edges.



76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/rpdfium/table/extractor.rb', line 76

# Refined edges for the page, built on first use and cached in @edges.
#
# The first pass uses the configured vertical/horizontal strategies. When
# that pass comes back empty, :auto_fallback is truthy, and the two
# strategies are not already both :text, a second pass is made with :text
# on both axes and its result is used instead.
#
# @return [Object] whatever build_edges returns (must respond to #empty?)
def edges
  return @edges if @edges

  vertical   = @settings[:vertical_strategy]
  horizontal = @settings[:horizontal_strategy]
  first_pass = build_edges(vertical, horizontal)

  # Retrying is pointless when both axes already ran with :text.
  text_only     = vertical == :text && horizontal == :text
  retry_as_text = first_pass.empty? && @settings[:auto_fallback] && !text_only

  @edges = retry_as_text ? build_edges(:text, :text) : first_pass
end

#extract(**text_opts) ⇒ Object

Extract the data of all tables: Array<Array<Array<String>>>.



108
109
110
111
112
113
114
115
116
# File 'lib/rpdfium/table/extractor.rb', line 108

# Extracts the data of every table found on the page.
#
# The text options default to the :text_x_tolerance, :text_y_tolerance and
# :text_keep_blank_chars settings; any keyword given here overrides the
# matching default, and unknown keywords are forwarded unchanged.
#
# @param text_opts [Hash] keyword overrides forwarded to each table's #extract
# @return [Array] one entry per table, each the result of that table's
#   #extract call
def extract(**text_opts)
  options = {
    x_tolerance: @settings[:text_x_tolerance],
    y_tolerance: @settings[:text_y_tolerance],
    keep_blank_chars: @settings[:text_keep_blank_chars],
    **text_opts
  }

  tables.map do |table|
    table.extract(**options)
  end
end

#intersections ⇒ Object



90
91
92
93
94
95
96
# File 'lib/rpdfium/table/extractor.rb', line 90

# Intersections of the refined edges, computed on first use and cached in
# @intersections.
#
# The x/y tolerances come from the :intersection_x_tolerance and
# :intersection_y_tolerance settings.
#
# @return [Object] the result of Edges.edges_to_intersections
def intersections
  return @intersections if @intersections

  refined = edges
  x_tol, y_tol = @settings.values_at(:intersection_x_tolerance,
                                     :intersection_y_tolerance)

  @intersections = Edges.edges_to_intersections(refined,
                                                x_tolerance: x_tol,
                                                y_tolerance: y_tol)
end

#tables ⇒ Object Also known as: find



102
103
104
# File 'lib/rpdfium/table/extractor.rb', line 102

# Tables found on the page, built on first use and cached in @tables.
#
# Cells are grouped by Cells.cells_to_tables and each group is wrapped in a
# Table together with the page.
#
# @return [Array<Table>] one Table per cell group
def tables
  return @tables if @tables

  groups = Cells.cells_to_tables(cells)
  @tables = groups.map { |cell_group| Table.new(@page, cell_group) }
end