Class: Precise::Transcription

Inherits:
Object
  • Object
show all
Defined in:
lib/precise/transcription.rb,
lib/precise/transcription_a2r.rb,
lib/precise/transcription_r2a.rb

Constant Summary collapse

A2R =
A2RTable = {
  "ال": "al-",
  "ء": "ʾ",
  "آ": "ʾā",
  "أ": "ʾa",
  "أُ": "ʾu",
  "إ": "ʾi",
  "ا": "ā",
  "ب": "b",
  "ة": "a",
  "ت": "t",
  "ث": "",
  "ج": "ǧ",
  "ح": "",
  "خ": "",
  "د": "d",
  "ذ": "",
  "ر": "r",
  "ز": "z",
  "س": "s",
  "ش": "š",
  "ص": "",
  "ض": "",
  "ط": "",
  "ظ": "",
  "ع": "ʿ",
  "غ": "ġ",
  "ف": "f",
  "ق": "q",
  "ك": "k",
  "ل": "l",
  "م": "m",
  "ن": "n",
  "ه": "h",
  "و": ["ū", "w"],
  "ى": "á",
  "ي": ["ī", "y"],
  "َ": "a",
  "ُ": "u",
  "ِ": "i",
  "پ": "p",
  "چ": "č",
  "ژ": "ž",
  "گ": "g",
  "٠": "0",
  "١": "1",
  "٢": "2",
  "٣": "3",
  "٤": "4",
  "٥": "5",
  "٦": "6",
  "٧": "7",
  "٨": "8",
  "٩": "9",
}.map{|k,v| [k.to_s, v]}.to_h
SHADDA =
' ّ'.strip
AlifVariants =

the various forms of alif, ya and waw

['أ', 'إ', 'آ', 'ا', 'ٱ']
Tashkeel =
# other character lists
# (here: the code points U+064B through U+065B, each as a one-character UTF-8 string;
#  built with Array#pack instead of eval-ing generated "\uXXXX" literals)
(0x064B..0x065B).map { |codepoint| [codepoint].pack('U') }
Nonprintables =
[R2LM, L2RM]
R2ATables =
{
  # Adapted from the Transcription in the Brill PDF's "Note to the Indices":
  # - a dash, depending on its position, denotes the start or end of the word
  # - an array denotes the requirement for a choice to be made from context
  # - any characters that are being replaced by DMG characters have been omitted
  common: {
    ʾ:     ,
    b:     ,
    p:     ,
    t:     ,
    ḥ:     ,
    d:     ,
    r:     ,
    z:     ,
    s:     ,
    ṣ:     ,
    ḍ:     ,
    ṭ:     ,
    ẓ:     ,
    ʿ:     ,
    f:     ,
    q:     ,
    k:     ,
    g:     ,
    l:     ,
    m:     ,
    n:     ,
    h:     ,
    w:     ,
    y:     ,
    ā:     ,
    ū:     ,
    ī:     ,
  },
  vowels: {
    a:     Fatha,
    à:     Fatha, # at word-end only
    u:     Damma,
    i:     Kasra,
  },
  combos: {
    aw:    :َو,
    ay:    :َي
  },
  brockelmann: {
    '-a':  ,  # "-" = at word-end
    '-at': ,  # "-" = at word-end
    'al-': :ال, # "-" = at word-start
  },
  dmg: {
    ṯ:     ,
    ǧ:     ,
    č:     ,
    ḫ:     ,
    ḏ:     ,
    ž:     ,
    š:     ,
    ġ:     
  },
  uppercase: {
    A:     :أَ,
    I:     :إِ,
    U:     :أُ,
    Y:     
  },
  farsi: {
    v:     , # always? what, e.g. about "Divbandi"?
    e:     [, Fatha] # word-end, mid-word
  },
  turkic: {
    ö:     ,
    ü:     Damma, # ???
    ı:     Kasra, # ???
    E:     
  },
  indic: {
    ō:     # things like "Bh" => "بْ" would go here, too
  },
  romanic: {
    c:    , # or should this rather be a س?
    o:    ,
    Ė:    :إي,
    x:    :كس
  },
  semitic: {
    ē:    :ﺍ # is that always so?
  },
  finnic: {
    ä:    Fatha # in e.g. Mänglī
  },
  precise: {
    á:    ,
    Ā:    , # don't add 'ʾĀ' here - it is considered an error in the input!
    'ʾā':  # same but lowercase - alif madda in the middle of the word
  }
}
PostR2AWordReplacements =
{
  /^(.*)لّاه/ => '\1 الله', # names ending in "allah"
  /(ب\.|إبن|إِبن)/ => 'بن', # "son of"
  /أَبي/ => 'أبي', # "father of" (gen.)
  /أَبو/ => 'أبو', # "father of" (nom.)
  /بَكر/ => 'بكر', # the name "bakr"
  /عَلي/ => 'علي', # the name "ali"
  /عَبد/ => 'عبد', # the name-part "abd"
  /افندي/ => 'افندی' # ottoman/turkish effendi
  # /([یي]زاده$)/ => ZWNJ+'ی'+ZWNJ+'زاده', # names ending in "-azade" # removed at DK's request
}
PostR2AContextReplacements =
{
  /((^|\.\s+)بن(\s+))/ => 'ابن\3', # exception: son-of in beginning of sentence
  /(تِ|تُ|تَ)(\s+)/ => 'ة ', # this'll lose the case ending, but that's for the better
  /داوود/ => 'داود' # not sure if this might actually hold true for all ...wū...?
}
PunctSepRgx =
/[ \.\-\(\)\?\&=,;:]/
R2A =

just one level is enough now

R2ATables.values.inject(:merge) # just one level is enough now
.keys_and_values_to_s
SunLetters =

more convenient to work with

%w[t  d  r z s š     l n]
RomanizedShortVowels =
%w[a i u]
RomanizedLongVowels =
%w[ā ū ī]
RomanizedConsonantals =

“a” here because of ta’marbouta, “á” because of alif maqsoura, “ā” because of word-final alif mamdouda

SunLetters + %w[m l k q f ġ ʿ   h ǧ b ʾ a á]
ArabicScriptVowels =
%w[ا ي و]
ArabicScriptConsonants =
%w[ا ب ت ث ج ح خ س ش ص ض ط ظ ع غ ف ق ك ل م ن ه ي ئ ة ى أ إ ؤ ئ آ]
LatinChars =
R2A.map{|l,a| l unless l.size != 1}.compact
TranslitChars_lowercase =
'ʾʿḏḥṣḍṭẓāūīṯǧčḫžšġōĖēáäüöü'
TranslitChars =
(TranslitChars_lowercase + TranslitChars_lowercase.upcase).chars.uniq.join

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ Transcription

Returns a new instance of Transcription.



3
4
5
6
7
8
9
# File 'lib/precise/transcription.rb', line 3

# Sets up a new transcription object.
#
# @param opts [Hash] overrides for the defaults `punctuation: true, verbosity: 0`;
#   a `verbose: true` entry is removed from the options and raises the verbosity by 2
def initialize(opts = {})
  @opts = { punctuation: true, verbosity: 0 }.merge(opts)
  verbose_flag = @opts.delete(:verbose) # always removed, whatever its value
  @opts[:verbosity] += 2 if verbose_flag == true
  # $dbg is a global: every new instance adds its verbosity on top of it
  $dbg += @opts[:verbosity]
  @out_chunks = []
end

Class Method Details

.reverse(romanized, opts = {}) ⇒ Object



443
444
445
# File 'lib/precise/transcription_r2a.rb', line 443

# Convenience wrapper: builds an instance with +opts+ and returns the result
# of calling #reverse on it with +romanized+.
def self.reverse(romanized, opts={})
  converter = new(opts)
  converter.reverse(romanized)
end

.transcribe(arabic, opts = {}) ⇒ Object



115
116
117
118
119
120
121
# File 'lib/precise/transcription_a2r.rb', line 115

# Convenience wrapper: prints two notices via `warn`, runs #transcribe on a
# fresh instance built with +opts+ and returns that instance's #transcription.
def self.transcribe(arabic, opts={})
  [
    "Romanisation is incomplete.",
    "Consider adding short vowels by hand as needed."
  ].each { |notice| warn notice.yellow }
  new(opts).tap { |engine| engine.transcribe(arabic) }.transcription
end

Instance Method Details

#alif_for_word_initial_kasra(word) ⇒ Object



251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# File 'lib/precise/transcription_r2a.rb', line 251

# Picks the alif to write for a word that starts with "i".
#
# The word is compared against a few fixed templates of equal length, where
# "C" stands for any romanized consonantal, "s" for any short vowel, "l" for
# any long vowel and every other template character for itself. If a template
# fits and the word does not start with "ist" (case-insensitive),
# AlifHamzaBelow is returned; otherwise Alif.
#
# @param word [String] the romanized word
# @return the value of AlifHamzaBelow or of Alif
def alif_for_word_initial_kasra(word)
  templates = %w[iCClC iCCiCClC iClCC]
  char_classes = {
    'C' => RomanizedConsonantals,
    's' => RomanizedShortVowels,
    'l' => RomanizedLongVowels
  }
  alif = Alif
  letters = word.chars

  fits = templates.any? do |template|
    next false unless letters.size == template.size

    shape_ok = letters.each_with_index.all? do |letter, pos|
      slot = template[pos]
      char_classes.key?(slot) ? char_classes[slot].include?(letter) : letter == slot
    end
    # words starting with "ist" (استـ) never count as a match
    shape_ok && !word.downcase.match?(/^ist/)
  end

  alif = AlifHamzaBelow if fits
  puts "\t\tfor #{word}: word-initial #{alif}".light_blue if $dbg > 1
  alif
end

#hamza_after_preceding(ch, first_letter_of_word = false) ⇒ Object



234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
# File 'lib/precise/transcription_r2a.rb', line 234

# Chooses what to write for a hamza that directly follows the vowel +ch+.
#
# @param ch [String] the romanized character preceding the hamza
#   (#reverse calls this with a short vowel a/i/u)
# @param first_letter_of_word [Boolean] whether +ch+ is the first letter of its word
# @return the matching constant / concatenation, or nil when no `when` branch applies
def hamza_after_preceding(ch, first_letter_of_word = false)
  if first_letter_of_word
    # word-initial vowel
    case ch.to_sym
      when :a then AlifHamzaAbove
      when :u then R2A['ā']+Damma+WawHamzaAbove
      when :i then R2A['ā']+YaHamzaAbove
    end
  else
    # vowel inside a word
    case ch.to_sym
      when :a then AlifHamzaAbove
      when :i then YaHamzaAbove
      when :u then WawHamzaAbove
      # NOTE(review): the condition of the next `when` is empty in this listing -
      # the literal appears to have been lost; confirm against the source file
      when  then YaHamzaAbove
    end
  end
end

#hamza_before_following(ch, pch, first_letter_of_word = false) ⇒ Object



202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# File 'lib/precise/transcription_r2a.rb', line 202

# Chooses what to write for a hamza that is directly followed by the vowel +ch+.
#
# NOTE(review): several `when` conditions below are empty in this listing - the
# literals appear to have been lost; confirm against the source file.
#
# @param ch [String] the romanized character following the hamza
# @param pch [String, nil] the romanized character preceding the hamza
# @param first_letter_of_word [Boolean] whether the hamza starts its word
# @return the matching constant / string, or nil when no `when` branch applies
def hamza_before_following(ch, pch, first_letter_of_word = false)
  if first_letter_of_word
    # word-initial hamza
    case ch.to_sym
      when :a, :u then AlifHamzaAbove
      when :i then AlifHamzaBelow
      when  then AlifMadda
      when  then "#{YaHamzaAbove}#{R2A[ch]}"
      when  then "#{WawHamzaAbove}#{R2A[ch]}"
    end
  else
    if %w[y ī].include? pch
      # also take into account what PRECEDED the hamza - that might take precedence!
      case ch.to_sym
        when :a then YaHamzaAbove
        when :i then YaHamzaAbove
        when :u then WawHamzaAbove
        when  then "#{YaHamzaAbove}#{R2A[ch]}"
        when  then "#{WawHamzaAbove}#{R2A[ch]}"
      end
    else
      # any other preceding character
      case ch.to_sym
        when :a then AlifHamzaAbove
        when :i then YaHamzaAbove
        when :u then
          # after a preceding "ū" the plain R2A['ʾ'] entry is used instead
          pch == 'ū' ? R2A['ʾ'] : WawHamzaAbove
        when  then "#{YaHamzaAbove}#{R2A[ch]}"
        when  then "#{WawHamzaAbove}#{R2A[ch]}"
      end
    end
  end
end

#reverse(romanized) ⇒ Object

input: valid Precise string

example: (al-)ʿAbbādī Muḥammad Ibn Aḥmad Ibn Muḥammad al-Harawī

output: Arabic string

example: العَبّادي مُحَمَّد بن أَحمَد بن مُحَمَّد الهَرَوي


298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
# File 'lib/precise/transcription_r2a.rb', line 298

# Converts a romanized (Precise transcription) string into Arabic script.
#
# The input is sanitized, then walked character by character with one
# character of look-behind (pch) and two of look-ahead (fch, ffch). Each rule
# below either appends to the output and moves on with `next`, or falls
# through to the following rule. Afterwards PostR2AWordReplacements are applied
# per word (in two rounds) and PostR2AContextReplacements on the joined string.
#
# @param romanized [String] the transcription to convert
# @return the joined Arabic string after `apply_options(@opts)` has been called on it
# @raise [Precise::NotATranscriptionError] if +romanized+ is nil
def reverse(romanized)
  raise Precise::NotATranscriptionError if romanized.nil?

  # sure, it's called "Precise", but it should still be
  # as tolerant as possible in what it accepts as input...
  romanized = sanitize(romanized)
  arabic = '' # we start with an empty string and go character by character

  puts "- (#{romanized.size}) [#{romanized}]".light_green if $dbg > 1

  # next, turn strings into character arrays
  romanized = romanized.chars
  arabic = arabic.chars
  # to be able to merge 2 romanized characters into 1 arabic character
  skip = false
  # print string like so: ʿ·A·b·b·ā·d·ī· ·M·u·ḥ·a·m·m·a·d· ·I·b·n· ·A·ḥ·m·a·d· ·I·b·n· ...
  puts "- (#{romanized.size}) [#{romanized.join('·')}]".light_green if $dbg > 1

  # loop over the romanized character array, filling the arabic one up as we go
  romanized.each_with_index do |ch,i|
    # a little bit of context
     pch = i == 0 ? nil : romanized[i-1]
     fch = romanized[i+1]
    ffch = romanized[i+2]

    # multi-letter skip-aheads
    # (the flag stays set while inside "al-", so that both "l" and "-" are skipped)
    if skip
      dbg "\t\tskipping #{ch}"
      if !(pch=='a' && fch=='-') # we're in the middle of "al-" (word-start)
        skip=false; end; next; end

    # symbols to remove from input
    (dbg "\tskipping unprintable symbol"; next) if [ZWNJ].include?(ch)

    # deal with alif madda before "normal" hamza rules follow
    # NOTE(review): the second pattern is empty in this listing (its characters
    # appear to have been lost); as written it would match any input - confirm
    # against the source file
    if ("#{ch}#{fch}".match?(/ʾā/) || "#{pch}#{ch}".match?(//))
      (dbg "\talif madda #{R2A['ʾā']}"; arabic << R2A['ʾā']; skip=true; next); end

    # hamza followed by a short or long vowel
    if ch == 'ʾ' && %w[a i u ā ī ū].include?(fch.to_s.downcase)
      is_first_letter_of_word = (pch.nil? || pch.match(/\s+/))
      (dbg "\t#{ch} with following #{fch}";
       arabic << hamza_before_following(fch, pch, is_first_letter_of_word);
       skip=true unless this_word(romanized.join, i).match?(/(a$|at($|\s))/)
       next); end
    # hamza preceded by a short vowel
    # (beware of a possible alif madda (would be dealt with above, on the next round))
    if fch.to_s == 'ʾ' && !ffch.to_s.match?(/[āĀ]/) && %w[a i u].include?(ch.downcase)
      is_first_letter_of_word = (pch.nil? || pch.match(/\s+/))
      (dbg "\t#{fch} carried on or following preceding #{ch}"
       arabic << hamza_after_preceding(ch, is_first_letter_of_word); skip=true; next); end

    # find the article "al", marked by having a dash appended to it
    (dbg "\tarticle al- #{R2A['al-']}"; arabic << R2A['al-']; skip=true; next) if ("#{ch}#{fch}#{ffch}" == 'al-')

    # unconditionally add spaces, dots and dashes to the output
    (dbg "\tinitial only (#{pch}#{ch})"; arabic << ch; next) if ch=='.' && (fch.nil? || fch.match(/\s+/))
    (dbg "\tnon-letter (#{ch})"; arabic << ch; next) if ch.match(PunctSepRgx) # white space or punctuation

    # a word-initial "a" or "u" must always be preceded by "ʾ"; only "i" can possibly *not* have one

    # deal with word-initial special cases
    if pch.to_s.strip.empty? # either beginning of string or of word
      if %w[a u].include?(ch)  
        (dbg "\tprepending #{ch} with hamza"; arabic << R2A[ch.upcase]; next); end
      if ch == 'i'
        (dbg "\thamza-less alif?"
         context = this_word(romanized.join, i)
         arabic << alif_for_word_initial_kasra(context.split(/^w?al-/).last)
         next); end; end

    # perform tashdeed
    (out=R2A[ch]+Shadda; dbg "\ttashdeed of #{ch} #{out}"; arabic << out; skip = true; next) if R2A[ch] && ch==fch

    # should there be a ta'marbouta or not at the end of the word?
    context1 = this_word(romanized.join,i)
    context2 = this_word_and_the_next(romanized.join,i)
    if context1 == context2 # single word
      if (i == context1.length-2 && "#{ch}#{fch}".match?(/at$/)) \
         || (i == context1.length-1 && "#{ch}#{fch}".match?(/a$/))
         arabic << R2A['-at']+' '; skip=true; next
      end
    else # multiple words
      if (i == context1.length-2 && "#{ch}#{fch}#{ffch}".match?(/at\s/))
        arabic << R2A['-a']+' '; skip = true; next
      elsif (i == context1.length-1 && "#{ch}#{fch}".match?(/a\s/))
        arabic << R2A['-a']+' '; next
      end
    end

    # letter ayn followed by uppercase vowel
    if ch == 'ʿ'
      (skip=true; ar=R2A[ch]) if %w[A I U].include?(fch)
      case fch # ayn+following vowel at beginning of word
        when 'A' then ar+=Fatha
        when 'I' then ar+=Kasra
        when 'U' then ar+=Damma; end; end
    (dbg "\tayn+vowel #{ch}#{fch} #{ar}"; arabic << ar; next) if ar && ar.size==2

    # long "a" at word-end: alif maqsoorah, otherwise normal alif
    # "e" at word-end: letter hah, otherwise just a fatha
    # (array entries: first element at end of input or before a space, last element otherwise)
    if R2A[ch].class == Array
      choice = (fch.nil? || fch==' ') ? R2A[ch].first : R2A[ch].last
      (dbg "\tcontextual #{ch} #{choice}"; arabic << choice; next); end

    # exact match (pure transliteration, no transcription effort required)
    (dbg "\tfrom table #{ch}#{R2A[ch]}"; arabic << R2A[ch]; next) if R2A[ch]

    # no luck yet; might be a regular uppercase letter
    (dbg "\tuppercased #{ch} #{R2A[ch.downcase]}"; arabic << R2A[ch.downcase]; next) if R2A[ch.downcase]

    # still no luck; last shot is punctuation
    (dbg "\tinterpunctuation #{ch}"; arabic << ch; next) if ch.match?(/[[:punct:]]/)

    # mark unknown characters as such; the philosophy here being that input to
    # Precise should be pre-processed enough for this to never have to happen…
    warn "Warning: character '#{ch}' is unknown to Precise and will be substituted by placeholder only".yellow
    # NOTE(review): the placeholder is an empty string in this listing - possibly a lost character; confirm
    arabic << ''
  end

  # character-array to word-array
  arabic = arabic.compact.join.split
  # العأَبّادي محمّد إِبن أَحمد إِبن محمّد للهروي (but with () around "al")
  puts "- (#{arabic.join(' ').size-1}) [#{L2RM+arabic.join(' ')+L2RM}]".light_green if $dbg > 1

  # dragnet replacement of special words, such as changing "ibn" into "bin"
  2.times.each_with_index do |i|
    puts "#{' '*6}(postprocessing round #{i+1})".light_green if $dbg > 1
    PostR2AWordReplacements.each{|rgx,subst|
      arabic.map!{|w|
        puts "#{' '*8}word match: #{L2RM}#{rgx.inspect} #{L2RM}=> #{L2RM}'#{subst}'".green if (w.match(rgx) && $dbg > 1)
        w.gsub(/-/, '') # dashes not needed anymore now
         .gsub(rgx, subst)} }
  end

  # some rules apply only in the context of words, not letters
  puts "- (#{arabic.join(' ').size-1}) [#{L2RM+arabic.join(' ')+L2RM}]".light_green if $dbg > 1
  arabic = arabic.join(' ')
  PostR2AContextReplacements.each{|rgx,subst|
    puts "#{' '*8}context match: #{L2RM}#{rgx.inspect} #{L2RM}=> #{L2RM}'#{subst}'".green if (arabic.match(rgx) && $dbg > 1)
    arabic.gsub!(rgx, subst) }

  return arabic.apply_options(@opts)
end

#sanitize(str) ⇒ Object



286
287
288
289
290
291
292
# File 'lib/precise/transcription_r2a.rb', line 286

# Cleans a romanized input string before conversion.
#
# Removes the characters U+200C and U+200F and lowercases every character
# that directly follows a ʿ or ʾ. The argument itself is left untouched (the
# cleaning happens on a copy), so frozen strings are accepted as well.
#
# @param str [String] the raw romanized input
# @return [String] a new, cleaned string
def sanitize(str)
  # remove nonprintables such as the ZWNJ
  # FIXME: the erroneous_chars replacement table should have already taken care of this?!
  cleaned = str.delete("\u200c\u200f")
  # make letters following either ʿ or ʾ lowercase
  previous = ''
  cleaned.each_char.map do |char|
    char = char.downcase if previous.match?(/[ʿʾ]/)
    previous = char
  end.join
end

#this_word(str, idx) ⇒ Object



186
187
188
# File 'lib/precise/transcription_r2a.rb', line 186

# Returns the word surrounding position +idx+ of +str+: the run of
# non-whitespace characters ending right before +idx+, followed by the run of
# TranslitChars / \w characters starting at +idx+ (if there is one).
def this_word(str, idx)
  before = str[0...idx]
  from_here = str[idx..-1]
  head = before[/\S*\z/]
  tail = from_here[/\A[#{TranslitChars}\w]+/]
  head + (tail || '')
end

#this_word_and_the_next(str, idx) ⇒ Object



190
191
192
193
194
195
196
197
198
199
200
# File 'lib/precise/transcription_r2a.rb', line 190

# Like #this_word, but the part starting at +idx+ must consist of the current
# word, whitespace and one more word. Strings without any whitespace are
# returned unchanged.
#
# NOTE(review): @translit_chars is not assigned anywhere in the code visible
# here, whereas #this_word interpolates the TranslitChars constant. If the
# variable is nil it interpolates as an empty string - confirm which one is intended.
def this_word_and_the_next(str, idx)
  return str unless str.match?(/\s+/)

  # everything before idx: the trailing run of non-whitespace characters
  word_start = str[0...idx][/\S*\z/]
  # from idx on: the rest of this word plus the following word, if both are there
  two_words = str[idx..-1][/\A[#{@translit_chars}\w]+\s+[#{@translit_chars}\w]+/i]
  word_start + (two_words || '')
end

#transcribe(arabic) ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/precise/transcription_a2r.rb', line 83

# Romanizes +arabic+ chunk by chunk, appending the results to @out_chunks.
#
# The input is split on runs of whitespace, digits and punctuation. Blank
# chunks are dropped, separator chunks are stored stripped, and every other
# chunk is converted letter by letter through the A2R table.
#
# @param arabic [String] the text to romanize
def transcribe(arabic)
  separator_rgx = /([\s\d[:punct:]]+)/
  arabic.split(separator_rgx).each do |chunk|
    next if chunk.strip.empty?

    if chunk.match? separator_rgx
      @out_chunks << chunk.strip
      next
    end

    @out_chunks << ''
    letters = chunk.chars
    has_article = chunk.match?(/^ال/)
    pending_skips = 0
    letters.each_with_index do |letter, pos|
      if pending_skips > 0
        pending_skips -= 1
        next
      end

      # a leading article is written in one go; its second letter gets skipped
      if pos == 0 && has_article
        @out_chunks[-1] << A2R['ال']
        pending_skips += 1
        next
      end

      # و and ي:
      # first in array is a long vowel,
      # second in array is a consonant
      rendered = A2R[letter]
      if rendered.class == Array
        if pos == 0 || pos + 1 == chunk.length
          @out_chunks[-1] << rendered[-1]
          next
        end
        rendered = rendered[0]
      end

      # a shadda repeats the romanization of the letter before it
      @out_chunks[-1] << A2R[letters[pos - 1]] if letter == SHADDA
      @out_chunks[-1] << rendered if rendered
    end
  end
end

#transcription ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/precise/transcription_a2r.rb', line 6

# Assembles the final romanized text from @out_chunks.
#
# Each chunk that consists solely of one of a few known words is replaced by
# its vocalized form; the chunks are then joined with spaces, the Arabic
# question mark and comma are turned into their ASCII counterparts and the
# spacing around punctuation and opening parentheses is tidied up.
#
# @return [String] the assembled transcription
def transcription
  # applied in this order to every chunk
  # NOTE(review): "fy" is replaced by an empty string - possibly a lost character; confirm
  word_fixes = [
    [/^m$/, 'mīlādī'],
    [/^h$/, 'hijrī'],
    [/^wāltī$/, 'wa-l-lātī'],
    [/^wālḏī$/, 'wa-l-lāḏī'],
    [/^hy$/, 'hiya'],
    [/^ʿlá$/, 'ʿalá'],
    [/^mn$/, 'min'],
    [/^yd$/, 'yad'],
    [/^fy$/, ''],
    [/^lhā$/, 'lahā']
  ]
  words = @out_chunks.map do |chunk|
    word_fixes.reduce(chunk) { |text, (pattern, replacement)| text.gsub(pattern, replacement) }
  end
  joined = words.join(' ')
  joined = joined.gsub('؟','?')
  joined = joined.gsub('،',',')
  # no whitespace in front of punctuation ...
  joined = joined.gsub(/\s+([[:punct:]]+)/,'\1')
  # ... but a single space before an opening parenthesis and none after it
  joined.gsub(/(?!(\s+|^))\(\s+/, ' (')
end