Class: PROIEL::Commands::Tokenize

Inherits:
PROIEL::Command show all
Defined in:
lib/proiel/cli/commands/tokenize.rb

Constant Summary collapse

# Names of the metadata fields that read_header accepts in a `%`-prefixed
# header line. The list is frozen so the shared constant cannot be mutated
# by accident at runtime.
VALID_METADATA_FIELDS =
%w(title author citation_part language id

principal funder distributor distributor_address date
license license_url
reference_system
editor editorial_note
annotator reviewer

electronic_text_editor electronic_text_title
electronic_text_version
electronic_text_publisher electronic_text_place electronic_text_date
electronic_text_original_url
electronic_text_license electronic_text_license_url

printed_text_editor printed_text_title
printed_text_edition
printed_text_publisher printed_text_place printed_text_date).freeze

Class Method Summary collapse

Methods inherited from PROIEL::Command

inherited, subclasses

Class Method Details

.init_with_program(prog) ⇒ Object



5
6
7
8
9
10
11
12
13
# File 'lib/proiel/cli/commands/tokenize.rb', line 5

# Registers the `tokenize` subcommand on the given program object.
#
# @param prog [#command] command-line program object; it must yield a
#   command object responding to `syntax`, `description` and `action`
# @return [Object] whatever `prog.command` returns
def init_with_program(prog)
  prog.command(:tokenize) do |c|
    # Declare the usage string once. Two consecutive `syntax` calls
    # ('tokenize' followed by '[options] filename') would leave only the
    # second one in effect if `syntax` is a plain setter.
    # NOTE(review): assumes last-call-wins semantics as in Mercenary - confirm.
    c.syntax 'tokenize [options] filename'
    c.description 'Tokenize raw text'

    # Delegate the actual work to process when the command is invoked.
    c.action { |args, options| process(args, options) }
  end
end

.process(args, options) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/proiel/cli/commands/tokenize.rb', line 15

# Runs the tokenize command: reads one raw text file and writes a PROIEL XML
# document to STDOUT.
#
# Prints a message to STDERR and exits with status 1 unless exactly one
# filename is given.
#
# @param args [Array<String>] positional command-line arguments; must hold
#   exactly one filename
# @param options [Object] parsed command-line options (not used here)
# @return [Object] the value of the File.open block
def process(args, options)
  usage_error =
    if args.empty?
      'Missing filename. Use --help for more information.'
    elsif args.length > 1
      'Too many filenames. Use --help for more information.'
    end

  if usage_error
    STDERR.puts usage_error
    exit 1
  end

  # The XML declaration is written before the input file is opened.
  xml = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
  xml.instruct! :xml, version: '1.0', encoding: 'UTF-8'

  File.open(args.first, 'r') do |file|
    # Both readers rewind the file themselves, so the order is irrelevant
    # to what each of them sees.
    header = read_header(file)
    body = read_body(file)

    root_attrs = { 'export-time' => header.export_time, 'schema-version' => '2.0' }
    source_attrs = { id: header.id, language: header.language }

    xml.proiel(root_attrs) do
      xml.source(source_attrs) do
        xml.title header.title
        xml.author header.author
        xml.tag!('citation-part', header.citation_part)

        tokenize(xml, body)
      end
    end
  end
end

.read_body(f) ⇒ Object



143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/proiel/cli/commands/tokenize.rb', line 143

# Reads the text body from a raw text file and groups it into source
# divisions.
#
# Lines starting with `%` (header lines) and blank lines are skipped. A line
# starting with `#` opens a new division whose title is the rest of that
# line. Every other line is appended to the contents of the current
# division; if none has been opened yet, an untitled one is created first.
#
# @param f [IO] readable stream; it is rewound before reading
# @return [Array<Hash>] one `{ title: String, contents: String }` hash per
#   division, in file order
def read_body(f)
  f.rewind

  divisions = []

  f.each_line do |line|
    # Header lines and empty lines carry no body text.
    next if line =~ /^%/
    next if line =~ /^\s*$/

    if line =~ /^#/
      divisions << { title: line.sub(/^#/, '').strip, contents: '' }
    else
      divisions << { title: '', contents: '' } if divisions.empty?
      divisions.last[:contents] += line
    end
  end

  divisions
end

.read_header(f) ⇒ Object



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# File 'lib/proiel/cli/commands/tokenize.rb', line 116

# Reads the metadata header from a raw text file.
#
# The header is the leading run of lines starting with `%`, each of the form
# `% field = value`. Reading stops at the first line that does not start
# with `%`. Unknown fields, and known fields without a `=` and value, are
# reported on STDERR and skipped.
#
# @param f [IO] readable stream; it is rewound before reading
# @return [OpenStruct] one attribute per accepted header field, holding the
#   stripped value
def read_header(f)
  f.rewind

  OpenStruct.new.tap do |hdr|
    f.each_line do |l|
      l.chomp!

      # The header ends with the first line that does not start with %.
      break unless l =~ /^%/

      # Split on the first `=` only, so values may themselves contain `=`.
      field, value = l.sub(/^%\s*/, '').split(/\s*=\s*/, 2)

      case field
      when 'id', 'export_time', *VALID_METADATA_FIELDS
        if value.nil?
          # A line such as "% title" has no `=`, so there is no value to
          # store; calling strip on nil would raise NoMethodError.
          STDERR.puts "Missing value for header field #{field}. Ignoring.".yellow
        else
          hdr[field] = value.strip
        end
      else
        STDERR.puts "Invalid header field #{field}. Ignoring.".yellow
      end
    end
  end
end

.tokenize(builder, body) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/proiel/cli/commands/tokenize.rb', line 47

# Splits each source division into sentences and tokens and emits them
# through the builder.
#
# Each division becomes a `div` with a `title`. Its contents are cut into
# sentences, and each sentence into `token` elements. Two kinds of inline
# marker are recognised:
#
# * `@text`  - the text after `@` is not a token; it is carried over into
#   the `presentation-before` attribute of the next token.
# * `§ref `  - sets the citation part used for all following tokens. The
#   value persists across sentences and divisions.
#
# @param builder [#div, #title, #sentence, #token] XML builder
# @param body [Array<Hash>] divisions as `{ title:, contents: }` hashes
# @return [Array<Hash>] the +body+ array
def tokenize(builder, body)
  citation_part = nil

  body.each do |division|
    builder.div do
      builder.title division[:title]

      # Insert a `|` after every probable sentence boundary, leaving the
      # `@...` and `§... ` markers untouched.
      marked_chunks = division[:contents].split(/(@[^ ]+|§[^ ]+ )/).map do |chunk|
        if chunk[0] == '§' || chunk[0] == '@'
          chunk
        else
          # The break goes after the punctuation mark and any trailing
          # characters typically used in pairs (brackets, quotes), not
          # directly after the punctuation mark itself.
          chunk.gsub(/([\.:;\?!]+[\s†\]\)"']*|\s*[\n\r]+)/, '\1|')
        end
      end

      marked_chunks.join.split('|').each do |sentence_text|
        builder.sentence(status: 'unannotated') do
          # Text collected from markers, waiting to be attached to the
          # next real token.
          pending = ''

          # Preserve linebreaks in the text.
          sentence_text.gsub!(/\s*[\n\r]+/, "\u2028")

          pieces = sentence_text.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+ )([^@§\p{Word}]*)/)

          pieces.each do |before, form, after|
            case form
            when /^@(.*)$/
              pending += "#{before}#{Regexp.last_match(1)}#{after}"
            when /^§(.*)$/
              citation_part = Regexp.last_match(1).strip
              pending += "#{before}#{after}"
            else
              before = pending + before
              pending = ''

              attrs = { :"citation-part" => citation_part, form: form }
              attrs[:"presentation-before"] = before unless before.empty?
              attrs[:"presentation-after"] = after unless after.empty?

              builder.token(attrs)
            end
          end
        end
      end
    end
  end
end