Class: SparkConnect::DataFrameStatFunctions

Inherits:
Object
  • Object
show all
Defined in:
lib/spark_connect/stat_functions.rb

Overview

Statistical helpers, returned by SparkConnect::DataFrame#stat. Mirrors PySpark’s `DataFrame.stat` (`DataFrameStatFunctions`).

Examples:

df.stat.corr("x", "y")
df.stat.approx_quantile("x", [0.25, 0.5, 0.75], 0.01)
df.stat.crosstab("a", "b").show

Constant Summary collapse

Proto =
SparkConnect::Proto

Instance Method Summary collapse

Constructor Details

#initialize(df) ⇒ DataFrameStatFunctions

Returns a new instance of DataFrameStatFunctions.

Parameters:



15
16
17
# File 'lib/spark_connect/stat_functions.rb', line 15

# Wraps the frame that the statistical helpers operate on.
#
# @param df [Object] the wrapped frame; the other methods shown for this class
#   call `relation` and `build` on it (presumably a SparkConnect::DataFrame,
#   since instances are returned by DataFrame#stat — confirm against the caller).
def initialize(df)
  @df = df
end

Instance Method Details

#approx_quantile(cols, probabilities, relative_error) ⇒ Array<Float>, Array<Array<Float>>

Approximate quantiles of numeric columns.

Parameters:

  • cols (String, Array<String>)
  • probabilities (Array<Float>)

    values in 0.0..1.0.

  • relative_error (Float)

Returns:

  • (Array<Float>, Array<Array<Float>>)

    a single list when `cols` is one column name (not an Array); otherwise one list per column.



54
55
56
57
58
59
60
61
62
63
# File 'lib/spark_connect/stat_functions.rb', line 54

# Approximate quantiles of one or more numeric columns.
#
# Arguments are validated locally, before any relation is built, so that an
# out-of-range probability or a negative error bound fails immediately with an
# ArgumentError instead of being forwarded inside the plan.
#
# @param cols [String, Symbol, Array<String, Symbol>] column name(s); anything
#   that is not an Array is treated as a single column. Names are converted
#   with `to_s`.
# @param probabilities [Array<Numeric>] quantile probabilities, each within
#   0.0..1.0 (inclusive).
# @param relative_error [Numeric] non-negative error bound, forwarded as-is to
#   the StatApproxQuantile relation.
# @return [Array<Float>, Array<Array<Float>>] for a single (non-Array) `cols`,
#   the first element of the collected row; otherwise the whole row as an
#   array (one entry per column).
# @raise [ArgumentError] if any probability is not a real number in 0.0..1.0,
#   or if `relative_error` is not a non-negative real number.
def approx_quantile(cols, probabilities, relative_error)
  probabilities.each do |probability|
    # `real?` rules out Complex, which is Numeric but not orderable; NaN fails
    # both comparisons and is rejected as well.
    next if probability.is_a?(Numeric) && probability.real? && probability >= 0 && probability <= 1

    raise ArgumentError, "probabilities must be numbers in 0.0..1.0, got #{probability.inspect}"
  end
  unless relative_error.is_a?(Numeric) && relative_error.real? && relative_error >= 0
    raise ArgumentError, "relative_error must be a non-negative number, got #{relative_error.inspect}"
  end

  # Remember the call shape so the return value mirrors it (flat vs. nested).
  single = !cols.is_a?(Array)
  rel = Proto::StatApproxQuantile.new(
    input: @df.relation, cols: Array(cols).map(&:to_s),
    probabilities: probabilities, relative_error: relative_error
  )
  row = @df.build(approx_quantile: rel).collect.first
  result = row.to_a
  single ? result.first : result
end

#corr(col1, col2, method = "pearson") ⇒ Float

Correlation of two columns (`method` is `"pearson"`).

Returns:

  • (Float)


27
28
29
30
# File 'lib/spark_connect/stat_functions.rb', line 27

# Correlation between two columns, reduced to a single value.
#
# @param col1 [#to_s] first column name (converted with `to_s`).
# @param col2 [#to_s] second column name (converted with `to_s`).
# @param method [String] correlation method, passed unchanged to the StatCorr
#   relation (defaults to "pearson").
# @return [Float] the value `scalar` extracts from the built plan.
def corr(col1, col2, method = "pearson")
  message = Proto::StatCorr.new(
    input: @df.relation,
    col1: col1.to_s,
    col2: col2.to_s,
    method: method
  )
  plan = @df.build(corr: message)
  scalar(plan)
end

#cov(col1, col2) ⇒ Float

Sample covariance of two columns.

Returns:

  • (Float)


21
22
23
# File 'lib/spark_connect/stat_functions.rb', line 21

# Sample covariance of two columns, reduced to a single value.
#
# @param col1 [#to_s] first column name (converted with `to_s`).
# @param col2 [#to_s] second column name (converted with `to_s`).
# @return [Float] the value `scalar` extracts from the built plan.
def cov(col1, col2)
  message = Proto::StatCov.new(
    input: @df.relation,
    col1: col1.to_s,
    col2: col2.to_s
  )
  plan = @df.build(cov: message)
  scalar(plan)
end

#crosstab(col1, col2) ⇒ DataFrame

Contingency table (cross-tabulation) of two columns.

Returns:

  • (DataFrame)

34
35
36
# File 'lib/spark_connect/stat_functions.rb', line 34

# Contingency table (cross-tabulation) of two columns.
#
# @param col1 [#to_s] first column name (converted with `to_s`).
# @param col2 [#to_s] second column name (converted with `to_s`).
# @return [DataFrame] the result of `@df.build` for the StatCrosstab relation.
def crosstab(col1, col2)
  message = Proto::StatCrosstab.new(
    input: @df.relation,
    col1: col1.to_s,
    col2: col2.to_s
  )
  @df.build(crosstab: message)
end

#freq_items(cols, support = 0.01) ⇒ DataFrame

Frequent items in the given columns.

Parameters:

  • cols (Array<String>)
  • support (Float) (defaults to: 0.01)

Returns:

  • (DataFrame)

43
44
45
46
# File 'lib/spark_connect/stat_functions.rb', line 43

# Frequent items in the given columns.
#
# @param cols [Array<String>, String] column name(s); wrapped with `Array()`
#   and each converted with `to_s`.
# @param support [Float] passed unchanged to the StatFreqItems relation
#   (defaults to 0.01).
# @return [DataFrame] the result of `@df.build` for the StatFreqItems relation.
def freq_items(cols, support = 0.01)
  source = @df.relation
  column_names = Array(cols).map { |name| name.to_s }
  message = Proto::StatFreqItems.new(input: source, cols: column_names, support: support)
  @df.build(freq_items: message)
end

#sample_by(col, fractions, seed = nil) ⇒ DataFrame

Stratified sample without replacement, keyed by `col`.

Parameters:

  • col (String, Column)
  • fractions (Hash{Object=>Float})

    per-stratum sampling fraction.

  • seed (Integer, nil) (defaults to: nil)

Returns:

  • (DataFrame)

71
72
73
74
75
76
77
78
79
# File 'lib/spark_connect/stat_functions.rb', line 71

# Stratified sample without replacement, keyed by `col`.
#
# @param col [String, Column] stratification column; anything that is not a
#   Column is turned into one via `Functions.col(col.to_s)`.
# @param fractions [Hash{Object=>Float}] per-stratum sampling fraction; each
#   key is converted with `Column.to_literal`.
# @param seed [Integer, nil] set on the relation only when truthy
#   (defaults to nil).
# @return [DataFrame] the result of `@df.build` for the StatSampleBy relation.
def sample_by(col, fractions, seed = nil)
  column =
    if col.is_a?(Column)
      col
    else
      Functions.col(col.to_s)
    end
  expression = column.to_expr

  strata = fractions.map do |stratum, fraction|
    literal = Column.to_literal(stratum)
    Proto::StatSampleBy::Fraction.new(stratum: literal, fraction: fraction)
  end

  message = Proto::StatSampleBy.new(input: @df.relation, col: expression, fractions: strata)
  # Leave the seed field untouched when no seed was given.
  message.seed = seed if seed
  @df.build(sample_by: message)
end