Class: IiifPrint::TextExtraction::PageOCR

Inherits:
Object
  • Object
show all
Defined in:
lib/iiif_print/text_extraction/page_ocr.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options) ⇒ PageOCR

Returns a new instance of PageOCR.



12
13
14
15
16
17
18
19
20
21
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 12

# Prepares OCR state for one page image. Only instance variables are assigned here;
# no external command is run by the constructor.
#
# @param path the page image location; run_ocr interpolates it into the tesseract command
# @param additional_tessearct_options extra text that run_ocr appends to the tesseract
#   command line; defaults to IiifPrint.config.additional_tessearct_options
#   (the keyword's spelling is part of the public interface and is kept as-is)
def initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options)
  @path = path
  @additional_tessearct_options = additional_tessearct_options
  # Every cached value starts out as nil (@html is the slot for the hOCR html).
  @html = @words = @source_meta = @box = @plain = nil
end

Instance Attribute Details

#html ⇒ Object

Returns the value of attribute html.



10
11
12
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 10

# Reader for the @html instance variable.
#
# NOTE(review): the constructor sets @html to nil under an "hOCR html" comment, and no
# method shown in this listing assigns it afterwards — presumably it is meant to hold
# the hOCR markup; confirm where (or whether) it gets populated.
#
# @return [Object, nil] the current value of @html
def html
  @html
end

#path ⇒ Object

Returns the value of attribute path.



10
11
12
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 10

# Reader for the @path instance variable, i.e. the +path+ argument the constructor received.
# run_ocr passes this value to tesseract as the input image.
#
# @return [Object] the value given as +path+ at construction time
def path
  @path
end

Instance Method Details

#alto ⇒ Object



70
71
72
73
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 70

# Builds a RenderAlto writer sized to this page (width, height) and hands it the
# page's words.
#
# @return [Object] whatever RenderAlto#to_alto returns (presumably ALTO XML — confirm
#   against RenderAlto)
def alto
  IiifPrint::TextExtraction::RenderAlto
    .new(width, height)
    .to_alto(words)
end

#height ⇒ Object



66
67
68
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 66

# Looks up the :height entry of the image metadata returned by #identify.
#
# @return [Object, nil] the value stored under :height
def height
  image_meta = identify
  image_meta[:height]
end

#identify ⇒ Object



57
58
59
60
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 57

# Returns metadata about the source image, computing it once and caching it in
# @source_meta. #width and #height index the result with :width and :height.
#
# NOTE(review): the listing was cut off right after the trailing dot on the
# ImageTool call; +metadata+ is restored here as the intended reader — confirm
# against IiifPrint::ImageTool.
#
# @return [Object] the cached result of IiifPrint::ImageTool#metadata for @path
def identify
  # Reuse the cached value so the image is only inspected once per instance.
  return @source_meta unless @source_meta.nil?
  @source_meta = IiifPrint::ImageTool.new(@path).metadata
end

#load_words ⇒ Object



31
32
33
34
35
36
37
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 31

# Runs the OCR pipeline and caches its output: preprocesses the image, runs OCR,
# then reads the resulting hOCR file into @words and @plain.
#
# @return [Object] the value assigned to @plain (the reader's text)
def load_words
  preprocess_image
  hocr_path = run_ocr
  hocr = IiifPrint::TextExtraction::HOCRReader.new(hocr_path)
  @words = hocr.words
  @plain = hocr.text
end

#plain ⇒ Object



52
53
54
55
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 52

# Plain text of the page. The first call (while @plain is still nil) triggers
# #load_words; later calls return the cached value.
#
# @return [Object] the value load_words stored in @plain
def plain
  return @plain unless @plain.nil?

  load_words
  @plain
end

#run_ocr ⇒ Object



23
24
25
26
27
28
29
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 23

# Shells out to the +tesseract+ command for the page image, asking for hOCR output
# under a freshly created temporary directory.
#
# The command's exit status and output are not inspected, and the temporary
# directory is not removed here.
#
# @return [String] "<tmpdir>/output_html.hocr", the file the caller then reads
def run_ocr
  # Loaded here so the method is self-contained; a repeated require is a no-op.
  require 'shellwords'

  outfile = File.join(Dir.mktmpdir, 'output_html')
  # Escape both paths: a space or shell metacharacter in a file name would otherwise
  # split the argument or change the command that the shell runs.
  cmd = "tesseract #{Shellwords.escape(path)} #{Shellwords.escape(outfile)} hocr"
  # The extra options are a ready-made option string (possibly several words),
  # so they are appended verbatim rather than escaped as a single argument.
  cmd += " #{@additional_tessearct_options}" if @additional_tessearct_options.present?
  `#{cmd}`
  outfile + '.hocr'
end

#width ⇒ Object



62
63
64
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 62

# Looks up the :width entry of the image metadata returned by #identify.
#
# @return [Object, nil] the value stored under :width
def width
  image_meta = identify
  image_meta[:width]
end

#word_json ⇒ Object



44
45
46
47
48
49
50
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 44

# Passes the page's words and pixel dimensions to WordCoordsBuilder.
#
# @return [Object] whatever WordCoordsBuilder.json_coordinates_for returns
#   (presumably a JSON document of word coordinates — confirm against the builder)
def word_json
  coords_builder = IiifPrint::TextExtraction::WordCoordsBuilder
  coords_builder.json_coordinates_for(words: words, width: width, height: height)
end

#words ⇒ Object



39
40
41
42
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 39

# Words recognised on the page. The first call (while @words is still nil) triggers
# #load_words; later calls return the cached value.
#
# @return [Object] the value load_words stored in @words
def words
  return @words unless @words.nil?

  load_words
  @words
end