-
Notifications
You must be signed in to change notification settings - Fork 51
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
factored out arabic_utilities.py as a separate file
- Loading branch information
1 parent
561a8c1
commit 1c355ce
Showing
3 changed files
with
172 additions
and
136 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
# utilities for Arabic script | ||
# in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter | ||
# as specified by the command line argument: | ||
# | ||
# % python3 arabic_utilities.py to <MorphoDictAra.gf | python3 arabic_utilities.py from >b.tmp | ||
# % diff MorphoDictAra.gf b.tmp | ||
# % | ||
|
||
def is_arabic(s):
    """Return True if *s* contains at least one character in U+0626..U+0650.

    Empty or None input yields False (a real bool, never the input itself),
    so the result is safe to store or compare as well as to test.

    NOTE(review): the range starts at U+0626 and ends at U+0650, so the
    hamza forms U+0621..U+0625 as well as shadda (U+0651) and sukun (U+0652)
    do not count on their own -- confirm that this is intended.
    """
    return bool(s) and any(0x626 <= ord(c) <= 0x650 for c in s)
|
||
|
||
def get_arabic(s):
    """Return the characters of *s* accepted by is_arabic, in original order."""
    return ''.join(filter(is_arabic, s))
|
||
|
||
def unvocalize(s):
    """Keep only the characters of *s* in U+0621..U+064A.

    Everything outside that range is removed: the vowel and other marks
    from U+064B upwards, but also spaces, Latin letters and punctuation.
    """
    first, last = '\u0621', '\u064a'
    return ''.join(filter(lambda ch: first <= ch <= last, s))
|
||
|
||
# Buckwalter transliteration table: Unicode code point -> ASCII symbol.
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
# Comments give the Unicode character name (without the "ARABIC LETTER" /
# "ARABIC" prefix) to avoid bidirectional text in the source.
buckwalter_dict = {
    # hamza and its carriers
    0x621: "'",  # HAMZA
    0x622: "|",  # ALEF WITH MADDA ABOVE
    0x623: ">",  # ALEF WITH HAMZA ABOVE
    0x624: "&",  # WAW WITH HAMZA ABOVE
    0x625: "<",  # ALEF WITH HAMZA BELOW
    0x626: "}",  # YEH WITH HAMZA ABOVE
    # letters
    0x627: "A",  # ALEF
    0x628: "b",  # BEH
    0x629: "p",  # TEH MARBUTA
    0x62a: "t",  # TEH
    0x62b: "v",  # THEH
    0x62c: "j",  # JEEM
    0x62d: "H",  # HAH
    0x62e: "x",  # KHAH
    0x62f: "d",  # DAL
    0x630: "*",  # THAL
    0x631: "r",  # REH
    0x632: "z",  # ZAIN
    0x633: "s",  # SEEN
    0x634: "$",  # SHEEN
    0x635: "S",  # SAD
    0x636: "D",  # DAD
    0x637: "T",  # TAH
    0x638: "Z",  # ZAH
    0x639: "E",  # AIN
    0x63a: "g",  # GHAIN
    0x641: "f",  # FEH
    0x642: "q",  # QAF
    0x643: "k",  # KAF
    0x644: "l",  # LAM
    0x645: "m",  # MEEM
    0x646: "n",  # NOON
    0x647: "h",  # HEH
    0x648: "w",  # WAW
    0x649: "Y",  # ALEF MAKSURA
    0x64a: "y",  # YEH
    # diacritics
    0x64b: "F",  # FATHATAN
    0x64c: "N",  # DAMMATAN
    0x64d: "K",  # KASRATAN
    0x64e: "a",  # FATHA
    0x64f: "u",  # DAMMA
    0x650: "i",  # KASRA
    0x651: "~",  # SHADDA
    0x652: "o",  # SUKUN
    # extras
    0x670: "`",  # SUPERSCRIPT ALEF
    0x671: "{",  # ALEF WASLA
}
|
||
|
||
# inverse table: Buckwalter ASCII symbol -> Arabic character (as a str)
buckwalter_dict_rev = {
    symbol: chr(codepoint)
    for codepoint, symbol in buckwalter_dict.items()
}
|
||
# fathatan, dammatan, kasratan, fatha, damma, kasra (U+064B..U+0650)
arabic_vowels = set('\u064b\u064c\u064d\u064e\u064f\u0650')
|
||
# every character in U+0628..U+0647, i.e. excluding alif, waw, ya
# NOTE(review): the range also contains teh marbuta (U+0629) and
# tatweel (U+0640) -- confirm they are meant to count as consonants.
sound_consonants = set(map(chr, range(0x628, 0x648)))
|
||
def to_buckwalter(s):
    """Transliterate *s* to Buckwalter, character by character.

    Characters whose code point is not a key of buckwalter_dict are
    copied through unchanged.
    """
    def symbol_for(ch):
        return buckwalter_dict.get(ord(ch), ch)

    return ''.join(map(symbol_for, s))
|
||
|
||
def from_buckwalter(s):
    """Map each Buckwalter symbol in *s* back to its Arabic character.

    Characters that are not keys of buckwalter_dict_rev are copied
    through unchanged.
    """
    def arabic_for(symbol):
        return buckwalter_dict_rev.get(symbol, symbol)

    return ''.join(map(arabic_for, s))
|
||
|
||
def drop_final_vowel(s):
    """Return *s* without its last character if that is in arabic_vowels.

    An empty string is returned unchanged; indexing s[-1] on it used to
    raise IndexError. At most one trailing vowel is removed.
    """
    if s and s[-1] in arabic_vowels:
        return s[:-1]
    return s
|
||
|
||
def normal(s):
    """Return *s* in Unicode normalization form NFD (canonical decomposition)."""
    # Imported here because the module has no top-level import of
    # unicodedata; without it every call raised NameError.
    import unicodedata

    return unicodedata.normalize('NFD', s)
|
||
# heuristic for finding the three radicals from certain forms
# works only for sound (strong) 3-radical roots, otherwise None
def get_sound_trigram_root(s):
    """Return the sound consonants of *s* as a 3-character string, or None.

    The characters of *s* that are in sound_consonants are collected in
    order; if there are not exactly three of them the result is None.
    """
    radicals = ''.join(ch for ch in s if ch in sound_consonants)
    return radicals if len(radicals) == 3 else None
|
||
|
||
# reverse engineer fcl pattern from a given form, with a sound trigram root
# one more condition: each of the root letters occurs exactly once
# TODO: better use the given root of the lex entry
def get_sound_fcl_pattern(s):
    """Return *s* with its three radicals replaced by feh, ain, lam.

    Returns None when get_sound_trigram_root finds no root for *s*, or
    when the three root letters are not pairwise distinct. The previous
    check, len([c in s for c in root]) == 3, was always true, so the
    "exactly once" condition stated above was never enforced.
    """
    root = get_sound_trigram_root(s)
    # The root is made of every sound consonant found in s, so "each root
    # letter occurs exactly once in s" is the same as "the three letters
    # are distinct".
    if not root or len(set(root)) != 3:
        return None

    pattern = list(s)
    start = 0
    # U+0641 feh, U+0639 ain, U+0644 lam stand for radicals 1, 2 and 3.
    for radical, placeholder in zip(root, '\u0641\u0639\u0644'):
        pos = s.index(radical, start)
        pattern[pos] = placeholder
        start = pos + 1
    return ''.join(pattern)
|
||
|
||
# Wikt uses vowel+shadda which is a Unicode normalization
# GF uses shadda+vowel which is linguistically correct
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
def reorder_shadda(s):
    """Rewrite every vowel+shadda pair in *s* as shadda+vowel.

    Only fatha (U+064E), damma (U+064F) and kasra (U+0650) are handled.
    The string is edited directly rather than through a Buckwalter round
    trip, which also converted any Latin Buckwalter symbol in the input
    (such as 'a', 'b' or '|') into an Arabic character.
    """
    shadda = '\u0651'
    # Same vowel order as the previous chained replaces (a, u, i), so the
    # result on Arabic text is unchanged.
    for vowel in '\u064e\u064f\u0650':
        s = s.replace(vowel + shadda, shadda + vowel)
    return s
|
||
|
||
# quote word forms but not parameters
def quote_if(s, cond=is_arabic, change=reorder_shadda):
    """Return change(s) wrapped in double quotes if cond(s) holds, else *s*."""
    if not cond(s):
        return s
    quote = '"'
    return quote + change(s) + quote
|
||
|
||
# for a string, change each string literal in "..." with a change function
# leaving other characters as they are; print the result to stdout
def change_literals(s, change):
    """Print *s* with the contents of every "..." literal passed through change.

    Text outside literals and the quote characters themselves are printed
    unchanged, and no newline is appended. A literal left unterminated at
    the end of *s* is printed unchanged, opening quote included; it used
    to be dropped from the output.

    NOTE(review): backslash-escaped quotes are not recognised; every '"'
    toggles literal state, as before.
    """
    # Splitting on '"' puts text outside literals at even indices and
    # literal contents at odd indices. An odd index is a closed literal
    # only if another piece follows it, hence the len(pieces) - 1 bound.
    pieces = s.split('"')
    for i in range(1, len(pieces) - 1, 2):
        pieces[i] = change(pieces[i])
    print('"'.join(pieces), end='')
|
||
|
||
# convert literals in stdin 'to' or 'from' Buckwalter
if __name__ == '__main__':
    import sys

    converters = {'to': to_buckwalter, 'from': from_buckwalter}
    # A missing argument used to raise IndexError, and an unknown mode
    # silently consumed stdin without writing anything.
    if len(sys.argv) < 2 or sys.argv[1] not in converters:
        sys.exit('usage: python3 arabic_utilities.py (to|from) <infile >outfile')

    convert = converters[sys.argv[1]]
    for line in sys.stdin:
        change_literals(line, convert)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters