Class: Rsssf::Page

Inherits:
Object
  • Object
show all
Includes:
Utils
Defined in:
lib/rsssf/page.rb,
lib/rsssf/page-meta.rb,
lib/rsssf/page-find_schedule.rb

Overview

note:

 a rsssf page may contain:
  many leagues, cups
  - tables, schedules (rounds), notes, etc.

an rsssf page MUST be plain text (.txt); utf-8 character encoding is assumed

Constant Summary collapse

OPT_REF =

lets you check for an optional ref, e.g. ‹§fin›

todo/fix - change to OPT_REF_RE   - make it a regex
   a regex embedded (interpolated) in another regex is inserted automatically (no need to escape)!!

lets you check for an optional ref, e.g. ‹§fin›

# Regex source fragment (a plain String, not a Regexp) describing an
# optional reference marker such as ‹§fin›, optionally preceded by spaces.
# The text between ‹§ and › is captured in the named group `ref`.
# The fragment is written with free-spacing whitespace, so it only works
# when interpolated into a regex compiled with the x flag
# (HX_RE and HEADER_RE both do this).
%q{
   (?: [ ]*
     ‹§ (?<ref> [^›]+?) ›
   )?
}
# Matches a single heading line:
#   optional leading spaces, a marker of one to six "=" characters,
#   the heading text (named group `text`, matched lazily), an optional
#   ‹§ref› marker (named group `ref`, via OPT_REF) and optional trailing
#   spaces up to the end of the line.
# The leading negative lookahead rejects lines that start with "=-=" and
# lines that consist only of "=" characters (underline-style rulers).
# Compiled with the x flag only, so "." does not cross line boundaries.
HX_RE =
%r{          ## negative lookahead
         ##   do NOT match  =-=
         ##   do NOT match  ===========  (without any heading text!!)
         ##     e.g.
         ##       Fall season
         ##       ===========

        (?! ^[ ]* (?:    =-=
                     |  ={1,} [ ]* $
                   )
         )

         ^
        [ ]*

      (?<marker> ={1,6})
         [ ]*
      (?<text> .+?)
         #{OPT_REF}
         [ ]*
$}x
HTML_COMMENT_HEADER_RE =

note - \A - start of string

the comment must start the .txt document!!!
# Matches an html comment (<!-- ... -->) located at the very start of the
# string (\A); only spaces and newlines may precede it.
# The comment body is captured lazily in the named group `text`, with
# surrounding spaces/newlines left out of the capture.
# Flags: i (case-insensitive), m ("." also matches newlines, so the body
# may span several lines), x (free-spacing).
# NOTE(review): the inline "trailing spaces and blank lines" remark sits on
# the part that consumes whitespace *before* "<!--" - looks like it means
# "leading"; confirm.
%r{  \A
         [ \n]*  ## trailing spaces and blank lines
    <!--
         [ \n]*
       (?<text> .+?)
         [ \n]*
     -->
}imx
HEADER_RE =

note - starts at

# Matches a single heading line:
#   optional leading spaces, a marker of one to six "=" characters
#   (named group `marker`), the heading text (named group `text`, lazy),
#   an optional ‹§ref› marker (named group `ref`, via OPT_REF) and
#   optional trailing spaces up to the end of the line.
# The leading negative lookahead rejects "=-=" lines and lines made up of
# "=" characters only (rulers without any heading text).
# The pattern text is the same as HX_RE.
%r{          ## negative lookahead
         ##   do NOT match  =-=
         ##   do NOT match  ===========  (without any heading text!!)
         ##     e.g.
         ##       Fall season
         ##       ===========

        (?! ^[ ]* (?:    =-=
                     |  ={1,} [ ]* $
                   )
         )

         ^
        [ ]*
      (?<marker> ={1,6})
         [ ]*
      (?<text> .+?)
         #{OPT_REF}
         [ ]*
$}x

Constants included from Utils

Utils::YEAR_FROM_NAME_RE

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Utils

#archive_dir_for_season, #year_from_file, #year_from_name

Constructor Details

#initialize(txt) ⇒ Page

Returns a new instance of Page.



56
57
58
59
# File 'lib/rsssf/page.rb', line 56

## Build a page around the given text.
##   txt - stored as-is in @txt
##   @url always starts out as nil
def initialize( txt )
  @txt, @url = txt, nil
end

Instance Attribute Details

#txtObject

use text alias too (for txt) - why? why not?



52
53
54
# File 'lib/rsssf/page.rb', line 52

## reader - returns the value held in @txt
def txt; @txt; end

#urlObject

source url



53
54
55
# File 'lib/rsssf/page.rb', line 53

## reader - returns the value held in @url (the source url)
def url; @url; end

Class Method Details

.parse_meta(txt) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/rsssf/page-meta.rb', line 40

## Parse the meta data found in the html comment header of a page text.
##
##   txt - the complete page text; checked against HTML_COMMENT_HEADER_RE
##
## Returns a hash (symbol keys => string values) with one entry per
##   "key: value" line of the comment, or nil if the text has no
##   comment header.
def self.parse_meta( txt )
  header = HTML_COMMENT_HEADER_RE.match( txt )
  return nil   unless header     ## no meta data (comment header) found

  header[:text].each_line.each_with_object( {} ) do |raw, meta|
    entry = raw.strip

    ## blank lines and comment lines (starting w/ #) are allowed "inline"
    next if entry.empty? || entry.start_with?('#')

    ## cut at the first colon (:) only - never more than two pieces
    name, value = entry.split( /[ ]*:[ ]*/, 2)
    meta[ name.to_sym ] = value
  end
end

.read_cache(url) ⇒ Object

use read_cache /web/html or such - why? why not?



30
31
32
33
34
35
36
37
38
39
40
# File 'lib/rsssf/page.rb', line 30

## Build a page from the cached html stored for the given url.
##   Reads the html via Webcache.read, prints its first 401 characters
##   (debug output), converts it with PageConverter.convert and wraps
##   the resulting text in a new page.
##
##   use read_cache /web/html or such - why? why not?
def self.read_cache( url )
  html = Webcache.read( url )

  puts "html:"
  pp html[0..400]

  new( PageConverter.convert( html, url: url ) )
end

.read_txt(path) ⇒ Object

use read_txt



43
44
45
46
47
# File 'lib/rsssf/page.rb', line 43

## Build a page from the text file at path (read via read_text).
##   note: always assume sources (already) converted from html to txt!!!!
def self.read_txt( path )
  new( read_text( path ) )
end

Instance Method Details

#_build_toc(txt) ⇒ Object



106
107
108
109
110
111
112
113
114
115
# File 'lib/rsssf/page.rb', line 106

## Build a flat table of contents for the given text.
##
## Returns an array of strings, one per heading matched by HX_RE,
##   formatted as "<marker> <text>" (e.g. "== Bundesliga");
##   the optional ref capture is not used.
def _build_toc( txt )
  txt.scan( HX_RE ).map do |marker, text, _ref|
    "#{marker} #{text}"
  end
end

#_find_schedule(header:, strict: false) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/rsssf/page-find_schedule.rb', line 76

## Look up the section addressed by header in the page text (@txt).
##
##   header - a single heading (String) or an array of nested headings
##   strict - passed on to _walk_sections
##
## Returns the section text wrapped in a Schedule,
##   or nil if _walk_sections comes back empty-handed.
def _find_schedule( header:, strict: false )
  ## a single string counts as a one-element path
  path = header.is_a?( String ) ? [header] : header

  body = _walk_sections( @txt, header: path, depth: 0, strict: strict )

  ## wrap in schedule class - why? why not?
  body ? Schedule.new( body ) : nil
end

#_scan_headingsObject

change to outline - why? why not?



102
# File 'lib/rsssf/page.rb', line 102

## Scan the page text (via the txt reader) for headings;
##   returns the raw String#scan result for HX_RE.
def _scan_headings
  txt.scan( HX_RE )
end

#_split_sections(txt, level: 2) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/rsssf/page-find_schedule.rb', line 42

## Split txt into sections keyed by the heading text of the given level.
##
##   txt   - text to split (walked line by line)
##   level - number of "=" markers a heading needs to open a new section
##
## Returns a hash of heading text => section body. The heading line itself
##   is not part of the body; every following line (headings of other
##   levels included) is appended until the next heading of this level.
##   Lines ahead of the first matching heading are dropped.
def _split_sections( txt, level: 2 )
  sections = {}
  buf      = nil    ## body of the section being filled (nil before the first heading)

  txt.each_line do |line|
    heading = HEADER_RE.match( line )

    if heading && heading[:marker].size == level
      buf = sections[ heading[:text] ] = String.new
    elsif buf
      buf << line
    end
  end

  sections
end

#_walk_sections(txt, header:, depth:, strict: false) ⇒ Object



94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/rsssf/page-find_schedule.rb', line 94

## Descend through nested sections following the headings in header.
##
##   txt    - text to search
##   header - array of headings; header[depth] is looked up on this call
##   depth  - zero-based position in header; headings of level depth+2
##            are used for splitting (that is, starts at level 2)
##   strict - raise instead of returning nil if a heading is missing
##
## Returns the text of the innermost section, or nil if not found
##   (strict: false). Raises ArgumentError if not found (strict: true).
def _walk_sections( txt, header:,
                         depth:,
                         strict: false )

  wanted   = header[depth]
  sections = _split_sections( txt, level: depth+2 )
  body     = sections[ wanted ]

  unless body
    raise ArgumentError, "section with header >#{wanted}< not found; sections incl. #{sections.keys}"  if strict
    return nil
  end

  ## last heading reached - done
  return body   unless header[depth+1]

  _walk_sections( body, header: header,
                        depth:  depth+1,
                        strict: strict )
end

#build_statObject



132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/rsssf/page.rb', line 132

## Collect summary statistics for this page.
##
## Reads the meta data from the comment header (parse_meta) and prints a
##   warning to stdout for every missing source, author(s) or last updated
##   entry. The year is derived from the basename of the source url path
##   (via year_from_name); without a source there is nothing to derive it
##   from, so year stays nil instead of crashing in URI.parse( nil ).
##
## Returns a PageStat record with source, year, title, authors,
##   last_updated, line_count, char_count and sections (table of contents)
##   filled in.
def build_stat
  meta = parse_meta( @txt ) || {}

  title        = meta[:title]
  source       = meta[:source]
  authors      = meta[:author] || meta[:authors]   ## note - check for author & authors !!!
  last_updated = meta[:updated]


  puts "*** !!! missing source"        if source.nil?
  puts "*** !!! missing author(s)"     if authors.nil?
  puts "**  !!! missing last updated"  if last_updated.nil?


  ## get year from source (url)
  ###   move (for reuse) to  year_from_url  in utils - why? why not?
  year = nil
  if source
    url_path  = URI.parse( source ).path
    basename  = File.basename( url_path, File.extname( url_path ) )  ## e.g. duit92.txt or duit92.html => duit92
    puts "   basename=>#{basename}<"
    year      = year_from_name( basename )
  end


  sections = _build_toc( @txt )


  rec = PageStat.new
  rec.source       = source         # e.g. http://rsssf.org/tabled/duit89.html   -- use source_url - why?? why not??
  rec.year         = year       ## note: in 2021/22  - year is always end_year, that is, 2022
  rec.title        = title
  rec.authors      = authors
  rec.last_updated = last_updated
  rec.line_count   = @txt.lines.count    ### or @txt.each_line.count
  rec.char_count   = @txt.size          ## note - size/length is true char count (@txt.bytesize is byte count!!)
  rec.sections     = sections

  rec
end

#find_schedule!(header:) ⇒ Object

make header required - yes

change to build_schedule - why? why not???
 add level: 2 or such - why? why not?


71
72
73
# File 'lib/rsssf/page-find_schedule.rb', line 71

## Strict lookup - same as _find_schedule with strict: true.
##   header - required; handed through unchanged
def find_schedule!( header: )
  _find_schedule( strict: true, header: header )
end

#parse_meta(txt) ⇒ Object



62
# File 'lib/rsssf/page-meta.rb', line 62

## instance-level shortcut - hands txt to the class-level parse_meta
def parse_meta( txt )
  self.class.parse_meta( txt )
end

#save(path) ⇒ Object



177
178
179
# File 'lib/rsssf/page.rb', line 177

## write the page text (@txt) to path (via write_text)
def save( path ) write_text( path, @txt ); end