Class: IiifPrint::TextExtraction::PageOCR
- Inherits:
-
Object
- Object
- IiifPrint::TextExtraction::PageOCR
- Defined in:
- lib/iiif_print/text_extraction/page_ocr.rb
Instance Attribute Summary collapse
-
#html ⇒ Object
Returns the value of attribute html.
-
#path ⇒ Object
Returns the value of attribute path.
Instance Method Summary collapse
- #alto ⇒ Object
- #height ⇒ Object
- #identify ⇒ Object
-
#initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options) ⇒ PageOCR
constructor
A new instance of PageOCR.
- #load_words ⇒ Object
- #plain ⇒ Object
- #run_ocr ⇒ Object
- #width ⇒ Object
- #word_json ⇒ Object
- #words ⇒ Object
Constructor Details
#initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options) ⇒ PageOCR
Returns a new instance of PageOCR.
12 13 14 15 16 17 18 19 20 21 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 12
#
# Stores the source path and the extra tesseract options, and resets every
# cached value to nil. Only instance variables are assigned here.
#
# @param path the source image path; #run_ocr interpolates it into the
#   tesseract command line and #identify passes it to IiifPrint::ImageTool.new
# @param additional_tessearct_options extra text appended to the tesseract
#   command by #run_ocr when present; defaults to
#   IiifPrint.config.additional_tessearct_options (the "tessearct" spelling is
#   part of the public keyword name and must be kept)
def initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options)
  @path = path
  # hOCR html:
  @html = nil
  # Caches filled by #load_words.
  @words = nil
  @plain = nil
  # Cache filled by #identify.
  @source_meta = nil
  @box = nil
  @additional_tessearct_options = additional_tessearct_options
end
Instance Attribute Details
#html ⇒ Object
Returns the value of attribute html.
10 11 12 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 10
#
# Returns the value of attribute html.
#
# @return the current value of @html (the constructor sets it to nil)
def html
  @html
end
#path ⇒ Object
Returns the value of attribute path.
10 11 12 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 10
#
# Returns the value of attribute path.
#
# @return the current value of @path (the constructor stores its first
#   argument there)
def path
  @path
end
Instance Method Details
#alto ⇒ Object
70 71 72 73 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 70
#
# Builds an IiifPrint::TextExtraction::RenderAlto writer from #width and
# #height and returns whatever its #to_alto produces for #words.
#
# @return the result of RenderAlto#to_alto (presumably ALTO XML — confirm
#   against RenderAlto)
def alto
  writer = IiifPrint::TextExtraction::RenderAlto.new(width, height)
  writer.to_alto(words)
end
#height ⇒ Object
66 67 68 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 66
#
# Returns the :height entry of the value returned by #identify.
def height
  identify[:height]
end
#identify ⇒ Object
57 58 59 60 |
# Memoized lookup: returns @source_meta when it is already set; otherwise
# assigns it from a call made on IiifPrint::ImageTool.new(@path).
# #width and #height read the :width and :height keys of the value returned here.
# NOTE(review): the method name after `IiifPrint::ImageTool.new(@path).` is
# missing from this rendering — presumably `metadata`; confirm against the
# ImageTool source before relying on it.
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 57 def identify return @source_meta unless @source_meta.nil? @source_meta = IiifPrint::ImageTool.new(@path). end |
#load_words ⇒ Object
31 32 33 34 35 36 37 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 31
#
# Runs the OCR pipeline once and fills the @words and @plain caches.
#
# Steps, in order: call preprocess_image (defined outside this view), run
# tesseract through #run_ocr, then read the resulting hOCR file with
# IiifPrint::TextExtraction::HOCRReader.
#
# @return the reader's text, which is the value of the final assignment
def load_words
  preprocess_image
  html_path = run_ocr
  reader = IiifPrint::TextExtraction::HOCRReader.new(html_path)
  @words = reader.words
  @plain = reader.text
end
#plain ⇒ Object
52 53 54 55 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 52
#
# Returns the cached plain text, calling #load_words first when @plain is
# still nil.
def plain
  load_words if @plain.nil?
  @plain
end
#run_ocr ⇒ Object
23 24 25 26 27 28 29 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 23
require 'shellwords'

# Runs the tesseract command-line tool on #path in hOCR mode, writing into a
# fresh temporary directory, and returns the expected output file path.
#
# The image path and output base are shell-escaped, so file names with spaces
# or shell metacharacters are passed to tesseract as one literal argument
# instead of being interpreted by the shell. @additional_tessearct_options is
# appended verbatim on purpose: it may hold several flags and comes from
# configuration, not from the file name.
#
# The command's exit status is not checked, and the temporary directory made
# by Dir.mktmpdir is not removed here because the caller still reads the file.
#
# @return [String] the output base name with '.hocr' appended
def run_ocr
  outfile = File.join(Dir.mktmpdir, 'output_html')
  cmd = "tesseract #{Shellwords.escape(path)} #{Shellwords.escape(outfile)} hocr"
  cmd += " #{@additional_tessearct_options}" if @additional_tessearct_options.present?
  `#{cmd}`
  "#{outfile}.hocr"
end
#width ⇒ Object
62 63 64 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 62
#
# Returns the :width entry of the value returned by #identify.
def width
  identify[:width]
end
#word_json ⇒ Object
44 45 46 47 48 49 50 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 44
#
# Delegates to IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for,
# passing this page's #words, #width and #height as keyword arguments.
#
# @return whatever json_coordinates_for returns (presumably a JSON string of
#   word coordinates — confirm against WordCoordsBuilder)
def word_json
  IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
    words: words,
    width: width,
    height: height
  )
end
#words ⇒ Object
39 40 41 42 |
# File 'lib/iiif_print/text_extraction/page_ocr.rb', line 39
#
# Returns the cached word list, calling #load_words first when @words is
# still nil.
def words
  load_words if @words.nil?
  @words
end