Class: Precise::Transcription
- Inherits:
-
Object
- Object
- Precise::Transcription
- Defined in:
- lib/precise/transcription.rb,
lib/precise/transcription_a2r.rb,
lib/precise/transcription_r2a.rb
Constant Summary collapse
- A2R =
A2RTable = { "ال": "al-", "ء": "ʾ", "آ": "ʾā", "أ": "ʾa", "أُ": "ʾu", "إ": "ʾi", "ا": "ā", "ب": "b", "ة": "a", "ت": "t", "ث": "ṯ", "ج": "ǧ", "ح": "ḥ", "خ": "ḫ", "د": "d", "ذ": "ḏ", "ر": "r", "ز": "z", "س": "s", "ش": "š", "ص": "ṣ", "ض": "ḍ", "ط": "ṭ", "ظ": "ẓ", "ع": "ʿ", "غ": "ġ", "ف": "f", "ق": "q", "ك": "k", "ل": "l", "م": "m", "ن": "n", "ه": "h", "و": ["ū", "w"], "ى": "á", "ي": ["ī", "y"], "َ": "a", "ُ": "u", "ِ": "i", "پ": "p", "چ": "č", "ژ": "ž", "گ": "g", "٠": "0", "١": "1", "٢": "2", "٣": "3", "٤": "4", "٥": "5", "٦": "6", "٧": "7", "٨": "8", "٩": "9", }.map{|k,v| [k.to_s, v]}.to_h
- SHADDA =
' ّ'.strip
- AlifVariants =
the various forms of alif, ya and waw
['أ', 'إ', 'آ', 'ا', 'ٱ']
- Tashkeel =
other character lists
("064B".to_i(16).."065B".to_i(16)).map{|dec| hex=("%04x" % dec); eval("char=\"\\u#{hex}\"; char")}
- Nonprintables =
[R2LM, L2RM]
- R2ATables =
{ # Adapted from the Transcription in the Brill PDF's "Note to the Indices": # - a dash, depending on its position, denotes the start or end of the word # - an array denotes the requirement for a choice to be made from context # - any characters that are being replaced by DMG characters have been ommitted common: { ʾ: :ء, b: :ب, p: :پ, t: :ت, ḥ: :ح, d: :د, r: :ر, z: :ز, s: :س, ṣ: :ص, ḍ: :ض, ṭ: :ط, ẓ: :ظ, ʿ: :ع, f: :ف, q: :ق, k: :ك, g: :گ, l: :ل, m: :م, n: :ن, h: :ه, w: :و, y: :ي, ā: :ا, ū: :و, ī: :ي, }, vowels: { a: Fatha, à: Fatha, # at word-end only u: Damma, i: Kasra, }, combos: { aw: :َو, ay: :َي }, brockelmann: { '-a': :ة, # "-" = at word-end '-at': :ة, # "-" = at word-end 'al-': :ال, # "-" = at word-start }, dmg: { ṯ: :ث, ǧ: :ج, č: :چ, ḫ: :خ, ḏ: :ذ, ž: :ژ, š: :ش, ġ: :غ }, uppercase: { A: :أَ, I: :إِ, U: :أُ, Y: :ي }, farsi: { v: :و, # always? what, e.g. about "Divbandi"? e: [:ه, Fatha] # word-end, mid-word }, turkic: { ö: :و, ü: Damma, # ??? ı: Kasra, # ??? E: :ا }, indic: { ō: :و # things like "Bh" => "بْ" would go here, too }, romanic: { c: :ث, # or should this rather be a س? o: :و, Ė: :إي, x: :كس }, semitic: { ē: :ﺍ # is that always so? }, finnic: { ä: Fatha # in e.g. Mänglī }, precise: { á: :ى, Ā: :آ, # don't add 'ʾĀ' here - it is considered an error in the input! 'ʾā': :آ # same but lowercase - alif madda in the middle of the word } }
- PostR2AWordReplacements =
{ /^(.*)لّاه/ => '\1 الله', # names ending in "allah" /(ب\.|إبن|إِبن)/ => 'بن', # "son of" /أَبي/ => 'أبي', # "father of" (gen.) /أَبو/ => 'أبو', # "father of" (nom.) /بَكر/ => 'بكر', # the name "bakr" /عَلي/ => 'علي', # the name "ali" /عَبد/ => 'عبد', # the name-part "abd" /افندي/ => 'افندی' # ottoman/turkish effendi # /([یي]زاده$)/ => ZWNJ+'ی'+ZWNJ+'زاده', # names ending in "-azade" # removed at DK's request }
- PostR2AContextReplacements =
{ /((^|\.\s+)بن(\s+))/ => 'ابن\3', # exception: son-of in beginning of sentence /(تِ|تُ|تَ)(\s+)/ => 'ة ', # this'll lose the case ending, but that's for the better /داوود/ => 'داود' # not sure if this might actually hold true for all ...wū...? }
- PunctSepRgx =
/[ \.\-\(\)\?\&=,;:]/
- R2A =
just one level is enough now
R2ATables.values.inject(:merge) # just one level is enough now .keys_and_values_to_s
- SunLetters =
more convenient to work with
%w[t ṯ d ḏ r z s š ṣ ḍ ṭ ẓ l n]
- RomanizedShortVowels =
%w[a i u]
- RomanizedLongVowels =
%w[ā ū ī]
- RomanizedConsonantals =
“a” here because of ta’marbouta, “á” because of alif maqsoura, “ā” because of word-final alif mamdouda
SunLetters + %w[m l k q f ġ ʿ ḫ ḥ h ǧ b ʾ a á]
- ArabicScriptVowels =
%w[ا ي و]
- ArabicScriptConsonants =
%w[ا ب ت ث ج ح خ س ش ص ض ط ظ ع غ ف ق ك ل م ن ه ي ئ ة ى أ إ ؤ ئ آ]
- LatinChars =
R2A.map{|l,a| l unless l.size != 1}.compact
- TranslitChars_lowercase =
'ʾʿḏḥṣḍṭẓāūīṯǧčḫžšġōĖēáäüöü'
- TranslitChars =
(TranslitChars_lowercase + TranslitChars_lowercase.upcase).chars.uniq.join
Class Method Summary collapse
Instance Method Summary collapse
- #alif_for_word_initial_kasra(word) ⇒ Object
- #hamza_after_preceding(ch, first_letter_of_word = false) ⇒ Object
- #hamza_before_following(ch, pch, first_letter_of_word = false) ⇒ Object
-
#initialize(opts = {}) ⇒ Transcription
constructor
A new instance of Transcription.
-
#reverse(romanized) ⇒ Object
Input: a valid Precise string, e.g. “(al-)ʿAbbādī Muḥammad Ibn Aḥmad Ibn Muḥammad al-Harawī”. Output: the corresponding Arabic string, e.g. “العَبّادي مُحَمَّد بن أَحمَد بن مُحَمَّد الهَرَوي”.
- #sanitize(str) ⇒ Object
- #this_word(str, idx) ⇒ Object
- #this_word_and_the_next(str, idx) ⇒ Object
- #transcribe(arabic) ⇒ Object
- #transcription ⇒ Object
Constructor Details
#initialize(opts = {}) ⇒ Transcription
Returns a new instance of Transcription.
3 4 5 6 7 8 9 |
# File 'lib/precise/transcription.rb', line 3

# Builds a transcriber whose options are the defaults below overridden by +opts+.
#
# @param opts [Hash] option overrides
# @option opts [Boolean] :punctuation (true) stored in @opts; not read by this method
# @option opts [Integer] :verbosity (0) amount added to the global $dbg level
# @option opts [Boolean] :verbose when exactly +true+, raises :verbosity by 2;
#   the key itself is removed from @opts
def initialize(opts = {})
  # The name of this local was missing from the rendered listing; any name works
  # because it never leaves this method.
  defaults = { punctuation: true, verbosity: 0 }
  @opts = defaults.merge(opts)
  @opts[:verbosity] += 2 if @opts.delete(:verbose) == true
  # NOTE(review): $dbg is process-global and grows with every construction;
  # it is presumably initialised elsewhere -- confirm.
  $dbg += @opts[:verbosity]
  @out_chunks = []
end
Class Method Details
.reverse(romanized, opts = {}) ⇒ Object
443 444 445 |
# File 'lib/precise/transcription_r2a.rb', line 443

# Class-level shortcut: builds an instance with +opts+ and returns whatever
# its #reverse produces for +romanized+.
#
# @param romanized [String] romanized input, handed to #reverse unchanged
# @param opts [Hash] options handed to the constructor
def self.reverse(romanized, opts = {})
  converter = new(opts)
  converter.reverse(romanized)
end
.transcribe(arabic, opts = {}) ⇒ Object
115 116 117 118 119 120 121 |
# File 'lib/precise/transcription_a2r.rb', line 115

# Class-level shortcut for Arabic-to-roman conversion: prints two warnings,
# runs #transcribe on a fresh instance and returns that instance's #transcription.
#
# @param arabic [String] Arabic-script input, handed to #transcribe
# @param opts [Hash] options handed to the constructor
def self.transcribe(arabic, opts = {})
  [
    "Romanisation is incomplete.",
    "Consider adding short vowels by hand as needed."
  ].each { |notice| warn notice.yellow }

  new(opts).tap { |transcriber| transcriber.transcribe(arabic) }.transcription
end
Instance Method Details
#alif_for_word_initial_kasra(word) ⇒ Object
251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 |
# File 'lib/precise/transcription_r2a.rb', line 251

# Chooses the alif used for a word that starts with "i".
#
# The word is compared against a few fixed templates of the same length.
# In a template:
#   C = any entry of RomanizedConsonantals
#   s = any entry of RomanizedShortVowels
#   l = any entry of RomanizedLongVowels
#   anything else = that literal character
# If a template fits and the word does not start with "ist" (case-insensitive),
# AlifHamzaBelow is returned; otherwise Alif.
#
# @param word [String] the romanized word to inspect
# @return AlifHamzaBelow or Alif
def alif_for_word_initial_kasra(word)
  templates = %w[iCClC iCCiCClC iClCC]
  letter_classes = {
    'C' => RomanizedConsonantals,
    's' => RomanizedShortVowels,
    'l' => RomanizedLongVowels
  }
  chosen = Alif

  hamza_needed = templates.any? do |template|
    next false unless word.size == template.size

    fits = word.each_char.with_index.all? do |letter, pos|
      slot = template[pos]
      letter_classes.key?(slot) ? letter_classes[slot].include?(letter) : letter == slot
    end
    # words in "ist…" (استـ) keep the plain alif even when a template fits
    fits && !word.downcase.match?(/^ist/)
  end

  chosen = AlifHamzaBelow if hamza_needed
  puts "\t\tfor #{word}: word-initial #{chosen}".light_blue if $dbg > 1
  chosen
end
#hamza_after_preceding(ch, first_letter_of_word = false) ⇒ Object
234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 |
# File 'lib/precise/transcription_r2a.rb', line 234

# Picks the hamza spelling determined by the short vowel that precedes the hamza.
#
# The vowel is matched case-insensitively: #reverse tests `ch.downcase` before
# calling this method but passes +ch+ unchanged, so an uppercase vowel used to
# fall through every branch and come back as nil.
#
# @param ch [String, Symbol] the vowel preceding the hamza (a, i, u; ī is also
#   accepted when not word-initial)
# @param first_letter_of_word [Boolean] whether +ch+ opens the word
# @return the Arabic character(s), or nil for a vowel no branch covers
def hamza_after_preceding(ch, first_letter_of_word = false)
  vowel = ch.downcase.to_sym

  if first_letter_of_word
    case vowel
    when :a then AlifHamzaAbove
    when :u then R2A['ā'] + Damma + WawHamzaAbove
    when :i then R2A['ā'] + YaHamzaAbove
    end
  else
    case vowel
    when :a     then AlifHamzaAbove
    when :i, :ī then YaHamzaAbove
    when :u     then WawHamzaAbove
    end
  end
end
#hamza_before_following(ch, pch, first_letter_of_word = false) ⇒ Object
202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 |
# File 'lib/precise/transcription_r2a.rb', line 202

# Picks the hamza spelling determined by the vowel that follows the hamza,
# also taking the character before the hamza into account.
#
# @param ch [String] the vowel following the hamza
# @param pch [String, nil] the character preceding the hamza
# @param first_letter_of_word [Boolean] whether the hamza opens the word
# @return the Arabic character(s), or nil for a vowel no branch covers
def hamza_before_following(ch, pch, first_letter_of_word = false)
  vowel = ch.to_sym

  # a following long ī / ū is spelled the same way in all three contexts:
  # the hamza carrier plus the long vowel's own letter
  return "#{YaHamzaAbove}#{R2A[ch]}" if vowel == :ī
  return "#{WawHamzaAbove}#{R2A[ch]}" if vowel == :ū

  if first_letter_of_word
    case vowel
    when :a, :u then AlifHamzaAbove
    when :i     then AlifHamzaBelow
    when :ā     then AlifMadda
    end
  elsif %w[y ī].include?(pch)
    # what preceded the hamza takes precedence here
    case vowel
    when :a, :i then YaHamzaAbove
    when :u     then WawHamzaAbove
    end
  else
    case vowel
    when :a then AlifHamzaAbove
    when :i then YaHamzaAbove
    when :u then pch == 'ū' ? R2A['ʾ'] : WawHamzaAbove
    end
  end
end
#reverse(romanized) ⇒ Object
input: valid Precise string
example: (al-)ʿAbbādī Muḥammad Ibn Aḥmad Ibn Muḥammad al-Harawī
output: Arabic string
example: العَبّادي مُحَمَّد بن أَحمَد بن مُحَمَّد الهَرَوي
298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 |
# Precise::Transcription#reverse -- turns a romanized ("Precise") string into Arabic script.
#
# NOTE(review): the listing below is a flattened rendering of the method; its
# original line breaks are gone, so each inline `#` comment runs into the code
# that followed it. Read it as a sequence of statements, not as runnable Ruby.
#
# @param romanized [String] the romanized input
# @raise [Precise::NotATranscriptionError] when +romanized+ is nil
#
# What the visible code does, in order:
#  1. passes the input through #sanitize and splits it into characters;
#  2. walks the characters with pch / fch / ffch as previous / next /
#     next-but-one context, trying in sequence: pending skip-ahead, ZWNJ
#     removal, alif madda, hamza before a vowel (#hamza_before_following),
#     hamza after a short vowel (#hamza_after_preceding), the article "al-",
#     dots and PunctSepRgx characters, word-initial a/u/i
#     (#alif_for_word_initial_kasra), doubled letters (Shadda), word-final
#     "a"/"at", ayn plus uppercase vowel, Array-valued table entries, exact R2A
#     lookup, lowercased R2A lookup, generic punctuation; anything else
#     triggers a warning and a placeholder character;
#  3. joins the result, splits it into words and applies
#     PostR2AWordReplacements in two rounds (dashes are removed in that pass);
#  4. re-joins with spaces and applies PostR2AContextReplacements.
#
# NOTE(review): the final `return arabic.(@opts)` appears to have lost a method
# name between `.` and `(` -- confirm against lib/precise/transcription_r2a.rb.
# NOTE(review): the word-final "a"/"at" checks compare `i` (an index into the
# whole input) with `context1.length` (length of the current word); presumably
# these only line up inside the first word -- verify.
# File 'lib/precise/transcription_r2a.rb', line 298 def reverse(romanized) raise Precise::NotATranscriptionError if romanized.nil? # sure, it's called "Precise", but it should still be # as tolerant as possible in what it accepts as input... romanized = sanitize(romanized) arabic = '' # we start with an empty string and go character by character puts "- (#{romanized.size}) [#{romanized}]".light_green if $dbg > 1 # next, turn strings into character arrays romanized = romanized.chars arabic = arabic.chars # to be able to merge 2 romanized characters into 1 arabic character skip = false # print string like so: ʿ·A·b·b·ā·d·ī· ·M·u·ḥ·a·m·m·a·d· ·I·b·n· ·A·ḥ·m·a·d· ·I·b·n· ... puts "- (#{romanized.size}) [#{romanized.join('·')}]".light_green if $dbg > 1 # loop over the romanized character array, filling the arabic one up as we go romanized.each_with_index do |ch,i| # a little bit of context pch = i == 0 ? nil : romanized[i-1] fch = romanized[i+1] ffch = romanized[i+2] # multi-letter skip-aheads if skip dbg "\t\tskipping #{ch}" if !(pch=='a' && fch=='-') # we're in the middle of "al-" (word-start) skip=false; end; next; end # symbols to remove from input (dbg "\tskipping unprintable symbol"; next) if [ZWNJ].include?(ch) # deal with alif madda before "normal" hamza rules follow if ("#{ch}#{fch}".match?(/ʾā/) || "#{pch}#{ch}".match?(/^Ā/)) (dbg "\talif madda #{R2A['ʾā']}"; arabic << R2A['ʾā']; skip=true; next); end # hamza followed by a short or long vowel if ch == 'ʾ' && %w[a i u ā ī ū].include?(fch.to_s.downcase) is_first_letter_of_word = (pch.nil? 
|| pch.match(/\s+/)) (dbg "\t#{ch} with following #{fch}"; arabic << hamza_before_following(fch, pch, is_first_letter_of_word); skip=true unless this_word(romanized.join, i).match?(/(a$|at($|\s))/) next); end # hamza preceded by a short vowel # (beware of a possible alif madda (would be dealt with above, on the next round)) if fch.to_s == 'ʾ' && !ffch.to_s.match?(/[āĀ]/) && %w[a i u].include?(ch.downcase) is_first_letter_of_word = (pch.nil? || pch.match(/\s+/)) (dbg "\t#{fch} carried on or following preceding #{ch}" arabic << hamza_after_preceding(ch, is_first_letter_of_word); skip=true; next); end # find the article "al", marked by having a dash appended to it (dbg "\tarticle al- #{R2A['al-']}"; arabic << R2A['al-']; skip=true; next) if ("#{ch}#{fch}#{ffch}" == 'al-') # unconditionally add spaces, dots and dashes to the output (dbg "\tinitial only (#{pch}#{ch})"; arabic << ch; next) if ch=='.' && (fch.nil? || fch.match(/\s+/)) (dbg "\tnon-letter (#{ch})"; arabic << ch; next) if ch.match(PunctSepRgx) # white space or punctuation # a word-initial "a" or "u" must always be preceded by "ʾ"; only "i" can possibly *not* have one # deal with word-initial special cases if pch.to_s.strip.empty? # either beginning of string or of word if %w[a u].include?(ch) (dbg "\tprepending #{ch} with hamza"; arabic << R2A[ch.upcase]; next); end if ch == 'i' (dbg "\thamza-less alif?" context = this_word(romanized.join, i) arabic << alif_for_word_initial_kasra(context.split(/^w?al-/).last) next); end; end # perform tashdeed (out=R2A[ch]+Shadda; dbg "\ttashdeed of #{ch} #{out}"; arabic << out; skip = true; next) if R2A[ch] && ch==fch # should there be a ta'marbouta or not at the end of the word? 
context1 = this_word(romanized.join,i) context2 = this_word_and_the_next(romanized.join,i) if context1 == context2 # single word if (i == context1.length-2 && "#{ch}#{fch}".match?(/at$/)) \ || (i == context1.length-1 && "#{ch}#{fch}".match?(/a$/)) arabic << R2A['-at']+' '; skip=true; next end else # multiple words if (i == context1.length-2 && "#{ch}#{fch}#{ffch}".match?(/at\s/)) arabic << R2A['-a']+' '; skip = true; next elsif (i == context1.length-1 && "#{ch}#{fch}".match?(/a\s/)) arabic << R2A['-a']+' '; next end end # letter ayn followed by uppercase vowel if ch == 'ʿ' (skip=true; ar=R2A[ch]) if %w[A I U].include?(fch) case fch # ayn+following vowel at beginning of word when 'A' then ar+=Fatha when 'I' then ar+=Kasra when 'U' then ar+=Damma; end; end (dbg "\tayn+vowel #{ch}#{fch} #{ar}"; arabic << ar; next) if ar && ar.size==2 # long "a" at word-end: alif maqsoorah, otherwise normal alif # "e" at word-end: letter hah, otherwise just a fatha if R2A[ch].class == Array choice = (fch.nil? || fch==' ') ? 
R2A[ch].first : R2A[ch].last (dbg "\tcontextual #{ch} #{choice}"; arabic << choice; next); end # exact match (pure transliteration, no transcription effort required) (dbg "\tfrom table #{ch}→#{R2A[ch]}"; arabic << R2A[ch]; next) if R2A[ch] # no luck yet; might be a regular uppercase letter (dbg "\tuppercased #{ch} #{R2A[ch.downcase]}"; arabic << R2A[ch.downcase]; next) if R2A[ch.downcase] # still no luck; last shot is punctuation (dbg "\tinterpunctuation #{ch}"; arabic << ch; next) if ch.match?(/[[:punct:]]/) # mark unknown characters as such; the philosophy here being that input to # Precise should be pre-processed enough for this to never have to happen… warn "Warning: character '#{ch}' is unknown to Precise and will be substituted by placeholder only".yellow arabic << '�' end # character-array to word-array arabic = arabic.compact.join.split # العأَبّادي محمّد إِبن أَحمد إِبن محمّد للهروي (but with () around "al") puts "- (#{arabic.join(' ').size-1}) [#{L2RM+arabic.join(' ')+L2RM}]".light_green if $dbg > 1 # dragnet replacement of special words, such as changing "ibn" into "bin" 2.times.each_with_index do |i| puts "#{' '*6}(postprocessing round #{i+1})".light_green if $dbg > 1 PostR2AWordReplacements.each{|rgx,subst| arabic.map!{|w| puts "#{' '*8}word match: #{L2RM}#{rgx.inspect} #{L2RM}=> #{L2RM}'#{subst}'".green if (w.match(rgx) && $dbg > 1) w.gsub(/-/, '') # dashes not needed anymore now .gsub(rgx, subst)} } end # some rules apply only in the context of words, not letters puts "- (#{arabic.join(' ').size-1}) [#{L2RM+arabic.join(' ')+L2RM}]".light_green if $dbg > 1 arabic = arabic.join(' ') PostR2AContextReplacements.each{|rgx,subst| puts "#{' '*8}context match: #{L2RM}#{rgx.inspect} #{L2RM}=> #{L2RM}'#{subst}'".green if (arabic.match(rgx) && $dbg > 1) arabic.gsub!(rgx, subst) } return arabic.(@opts) end |
#sanitize(str) ⇒ Object
286 287 288 289 290 291 292 |
# File 'lib/precise/transcription_r2a.rb', line 286

# Normalises romanized input before conversion.
#
# Removes ZWNJ (U+200C) and RLM (U+200F), then lowercases every character that
# directly follows ʿ or ʾ. Works on a copy: the caller's string is left
# untouched, so frozen strings are accepted as well.
#
# @param str [String] romanized input
# @return [String] a new, cleaned-up string
def sanitize(str)
  # remove nonprintables such as the ZWNJ
  # FIXME: the erroneous_chars replacement table should have already taken care of this?!
  cleaned = str.delete("\u200c\u200f")

  # make letters following either ʿ or ʾ lowercase
  previous = ''
  cleaned.each_char.map do |char|
    current = previous.match?(/[ʿʾ]/) ? char.downcase : char
    previous = current
    current
  end.join
end
#this_word(str, idx) ⇒ Object
186 187 188 |
# File 'lib/precise/transcription_r2a.rb', line 186

# Returns the word of +str+ that surrounds position +idx+.
#
# @param str [String] the full romanized string
# @param idx [Integer] character index into +str+
# @return [String] the non-whitespace run ending right before +idx+, followed by
#   the run of TranslitChars / word characters starting at +idx+ (if any)
def this_word(str, idx)
  before = str[0...idx][/\S*\z/]
  from_here = str[idx..-1][/\A[#{TranslitChars}\w]+/]
  before + (from_here || '')
end
#this_word_and_the_next(str, idx) ⇒ Object
190 191 192 193 194 195 196 197 198 199 200 |
# File 'lib/precise/transcription_r2a.rb', line 190

# Returns the word of +str+ that surrounds position +idx+ together with the word after it.
#
# First part: the non-whitespace run that ends right before +idx+.
# Second part: starting at +idx+, the rest of the current word, the whitespace
# after it and the following word. When no following word exists the second
# part is empty and only the first part is returned.
# A string without any whitespace is returned unchanged.
#
# The set of transliteration letters is taken from @translit_chars when it is
# set and otherwise from the TranslitChars constant (which #this_word uses).
# Without the fallback a nil @translit_chars reduced the character class to
# ASCII-only `\w`, so the match stopped at the first letter such as ā or ṭ.
#
# @param str [String] the full romanized string
# @param idx [Integer] character index into +str+
# @return [String]
def this_word_and_the_next(str, idx)
  return str unless str.match?(/\s+/)

  letters = @translit_chars || TranslitChars
  before = str[0...idx][/\S*\z/]
  from_here = str[idx..-1][/\A[#{letters}\w]+\s+[#{letters}\w]+/i]
  before + (from_here || '')
end
#transcribe(arabic) ⇒ Object
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
# File 'lib/precise/transcription_a2r.rb', line 83

# Romanizes +arabic+ chunk by chunk, appending the results to @out_chunks.
#
# The input is split on runs of whitespace, ASCII digits and punctuation.
# Blank separators are dropped, other separators are stored stripped, and each
# word is converted character by character through the A2R table:
#  - a leading "ال" becomes A2R['ال'] and its lam is skipped;
#  - و / ي (Array entries: [long vowel, consonant]) are consonants at the start
#    or end of a word and directly before a shadda, long vowels otherwise;
#  - a shadda repeats the romanization of the preceding character, using the
#    consonantal reading for و / ي. A shadda with nothing usable before it is
#    ignored (previously this appended an Array or nil to a String and raised
#    TypeError, or wrapped around to the word's last character at index 0);
#  - characters missing from A2R are dropped.
#
# @param arabic [String] Arabic-script input
# @return [Array<String>] the chunks the input was split into
def transcribe(arabic)
  non_word_rgx = /([\s\d[:punct:]]+)/

  arabic.split(non_word_rgx).each do |chunk|
    stripped = chunk.strip
    next if stripped.empty?

    if chunk.match?(non_word_rgx)
      @out_chunks << stripped
      next
    end

    chars = chunk.chars
    skip = 0
    @out_chunks << ''

    chars.each_with_index do |ch, j|
      if skip > 0
        skip -= 1
        next
      end

      if j.zero? && chunk.match?(/^ال/)
        @out_chunks[-1] << A2R['ال']
        skip += 1
        next
      end

      if ch == SHADDA && j > 0
        # Array(...) copes with plain entries, [vowel, consonant] pairs and nil alike
        doubled = Array(A2R[chars[j - 1]]).last
        @out_chunks[-1] << doubled if doubled
      end

      romanized = A2R[ch]
      if romanized.is_a?(Array)
        consonantal = j.zero? || j + 1 == chunk.length || chars[j + 1] == SHADDA
        romanized = consonantal ? romanized.last : romanized.first
      end
      @out_chunks[-1] << romanized if romanized
    end
  end
end
#transcription ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# File 'lib/precise/transcription_a2r.rb', line 6

# Assembles the final romanized text from @out_chunks.
#
# Each chunk that consists solely of one of the vowel-less spellings below is
# replaced by its full form; the chunks are then joined with spaces, Arabic
# question mark and comma are replaced by their Latin counterparts, whitespace
# before punctuation is removed and an opening parenthesis that is not at the
# start of a line gets a space before it and none after it.
#
# @return [String]
def transcription
  expansions = [
    [/^m$/,     'mīlādī'],
    [/^h$/,     'hijrī'],
    [/^wāltī$/, 'wa-l-lātī'],
    [/^wālḏī$/, 'wa-l-lāḏī'],
    [/^hy$/,    'hiya'],
    [/^ʿlá$/,   'ʿalá'],
    [/^mn$/,    'min'],
    [/^yd$/,    'yad'],
    [/^fy$/,    'fī'],
    [/^lhā$/,   'lahā']
  ]

  expanded = @out_chunks.map do |chunk|
    expansions.inject(chunk) { |text, (pattern, full_form)| text.gsub(pattern, full_form) }
  end

  text = expanded.join(' ')
  text = text.gsub('؟','?')
  text = text.gsub('،',',')
  text = text.gsub(/\s+([[:punct:]]+)/,'\1')
  text.gsub(/(?!(\s+|^))\(\s+/, ' (')
end