Skip to content

Commit

Permalink
factored out arabic_utilities.py as a separate file
Browse files Browse the repository at this point in the history
  • Loading branch information
aarneranta committed Sep 25, 2023
1 parent 561a8c1 commit 1c355ce
Show file tree
Hide file tree
Showing 3 changed files with 172 additions and 136 deletions.
169 changes: 169 additions & 0 deletions src/arabic/wiktionary/arabic_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
# utilities for Arabic script
# in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter
# as specified by the command line argument:
#
# % python3 arabic_utilities.py to <MorphoDictAra.gf | python3 arabic_utilities.py from >b.tmp
# % diff MorphoDictAra.gf b.tmp
# %

def is_arabic(s):
    """Return True if *s* contains at least one Arabic-script character.

    A character counts as Arabic when its code point lies in
    U+0621..U+0652 (hamza through sukun) or is U+0670 / U+0671, i.e. the
    code points covered by the Buckwalter transliteration table.

    The previous bounds (1574..1616, i.e. U+0626..U+0650) left out the
    hamza forms U+0621..U+0625 as well as shadda and sukun, so filtering a
    word character by character silently dropped those characters.

    Empty strings and None yield False.
    """
    if not s:
        return False
    return any(
        0x621 <= ord(c) <= 0x652 or ord(c) in (0x670, 0x671)
        for c in s
    )


def get_arabic(s):
    """Return the characters of *s* that is_arabic() accepts, in order."""
    return ''.join(filter(is_arabic, s))


def unvocalize(s):
    """Keep only the characters of *s* in U+0621..U+064A.

    Everything outside that code-point range is removed, which includes
    the diacritics from U+064B upwards and any non-Arabic characters.
    """
    first_letter, last_letter = 0x621, 0x64a
    return ''.join(ch for ch in s if first_letter <= ord(ch) <= last_letter)


# Buckwalter transliteration table: Unicode code point -> ASCII symbol.
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
# Keys are integer code points (not characters), so lookups go through ord().
# Every value is distinct, which lets the table be inverted for the reverse
# conversion.
buckwalter_dict = {
    0x621: "'", # ء
    0x622: '|', # آ
    0x623: '>', # أ
    0x624: '&', # ؤ
    0x625: '<', # إ
    0x626: '}', # ئ
    0x627: 'A', # ا
    0x628: 'b', # ب
    0x629: 'p', # ة
    0x62a: 't', # ت
    0x62b: 'v', # ث
    0x62c: 'j', # ج
    0x62d: 'H', # ح
    0x62e: 'x', # خ
    0x62f: 'd', # د
    0x630: '*', # ذ
    0x631: 'r', # ر
    0x632: 'z', # ز
    0x633: 's', # س
    0x634: '$', # ش
    0x635: 'S', # ص
    0x636: 'D', # ض
    0x637: 'T', # ط
    0x638: 'Z', # ظ
    0x639: 'E', # ع
    0x63a: 'g', # غ
    0x641: 'f', # ف
    0x642: 'q', # ق
    0x643: 'k', # ك
    0x644: 'l', # ل
    0x645: 'm', # م
    0x646: 'n', # ن
    0x647: 'h', # ه
    0x648: 'w', # و
    0x649: 'Y', # ى
    0x64a: 'y', # ي
    0x64b: 'F', # fathatan
    0x64c: 'N', # dammatan
    0x64d: 'K', # kasratan
    0x64e: 'a', # fatha
    0x64f: 'u', # damma
    0x650: 'i', # kasra
    0x651: '~', # shadda
    0x652: 'o', # sukun
    0x670: '`', # superscript alef
    0x671: '{' # ٱ (alef wasla)
}


# Inverse table: Buckwalter ASCII symbol -> Arabic character (a str, not a code point).
buckwalter_dict_rev = dict(zip(buckwalter_dict.values(), map(chr, buckwalter_dict)))

# Vowel marks U+064B..U+0650: fathatan, dammatan, kasratan, fatha, damma, kasra.
# Shadda (U+0651) and sukun (U+0652) are not part of this set.
arabic_vowels = set(map(chr, range(0x64b, 0x651)))

# Characters U+0628..U+0647: everything from ba up to (not including) waw,
# so alif, waw and ya are excluded.
# NOTE(review): the range also spans ta marbuta (U+0629), tatweel (U+0640) and
# U+063B..U+063F - confirm that treating them as radicals is intended.
sound_consonants = set(map(chr, range(0x628, 0x648)))

def to_buckwalter(s):
    """Transliterate the string *s* into Buckwalter ASCII symbols.

    Characters without an entry in buckwalter_dict are passed through
    unchanged.
    """
    # buckwalter_dict is keyed by code point, which is exactly the mapping
    # shape str.translate expects.
    return s.translate(buckwalter_dict)


def from_buckwalter(s):
    """Convert Buckwalter ASCII symbols in *s* back to Arabic characters.

    Symbols without an entry in buckwalter_dict_rev are passed through
    unchanged.
    """
    arabic = []
    for symbol in s:
        arabic.append(buckwalter_dict_rev.get(symbol, symbol))
    return ''.join(arabic)


def drop_final_vowel(s):
    """Return *s* without its last character if that is in arabic_vowels.

    Any other string, including the empty string, is returned unchanged.
    (Indexing s[-1] on an empty string used to raise IndexError.)
    """
    if s and s[-1] in arabic_vowels:
        return s[:-1]
    return s


def normal(s):
    """Return *s* in Unicode canonical decomposition form (NFD)."""
    # Imported locally: this module has no top-level `import unicodedata`,
    # so the bare name used to raise NameError on the first call.
    import unicodedata

    return unicodedata.normalize('NFD', s)

# heuristic for finding the three radicals from certain forms
# works only for sound (strong) 3-radical roots, otherwise None
def get_sound_trigram_root(s):
    """Return the sound consonants of *s* as a 3-character string, else None.

    The characters of *s* that belong to sound_consonants are collected in
    order; the result is returned only when there are exactly three of them.
    """
    radicals = ''.join([ch for ch in s if ch in sound_consonants])
    return radicals if len(radicals) == 3 else None


# reverse engineer fcl pattern from a given form, with a sound trigram root
# TODO: better use the given root of the lex entry
def get_sound_fcl_pattern(s):
    """Return *s* with its three radicals replaced by U+0641, U+0639, U+0644.

    The radicals come from get_sound_trigram_root(s); when that yields no
    root the function returns None. Each radical is located at its first
    occurrence after the previous one and overwritten in a copy of *s*.
    """
    root = get_sound_trigram_root(s)
    if not root:
        return None
    # NOTE(review): the original guard, len([c in s for c in root]) == 3,
    # only counts the radicals; it does not check that each root letter
    # occurs exactly once in s. It is kept as the equivalent length check.
    if len(root) != 3:
        return None

    slots = list(s)
    pos = s.find(root[0])
    slots[pos] = chr(0x641)
    for radical, placeholder in ((root[1], chr(0x639)), (root[2], chr(0x644))):
        # search only to the right of the previously replaced radical
        pos += s[pos + 1:].find(radical) + 1
        slots[pos] = placeholder
    return ''.join(slots)


# Wikt uses vowel+shadda which is a Unicode normalization
# GF uses shadda+vowel which is linguistically correct
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
def reorder_shadda(s):
    """Rewrite every vowel+shadda pair in *s* as shadda+vowel.

    Handles fatha (U+064E), damma (U+064F) and kasra (U+0650), in that
    order, exactly as the earlier Buckwalter-based version did.

    The swap is done directly on the Arabic code points. The earlier
    version converted to Buckwalter and back, which also turned any ASCII
    character that happens to be a Buckwalter symbol (for example 'a' or
    'b') into an Arabic character.
    """
    shadda = '\u0651'
    for vowel in ('\u064e', '\u064f', '\u0650'):
        s = s.replace(vowel + shadda, shadda + vowel)
    return s


# quote word forms but not parameters
def quote_if(s, cond=is_arabic, change=reorder_shadda):
    """Return change(s) wrapped in double quotes when cond(s) holds, else s.

    With the defaults, strings containing Arabic are quoted (after shadda
    reordering) and everything else is returned untouched.
    """
    if not cond(s):
        return s
    quoted = '"' + change(s) + '"'
    return quoted


def change_literals(s, change):
    """Print *s* to stdout with every "..." literal passed through *change*.

    Characters outside double-quoted literals are copied as they are. For
    each complete literal, the text between the quotes is replaced by
    change(text) and the quotes are kept. Nothing is appended to the
    output (no extra newline).

    A literal that is still open when *s* ends is written back unchanged,
    opening quote included; it used to be dropped from the output.

    NOTE(review): backslash-escaped quotes inside a literal are not
    recognised; every '"' toggles the literal state.
    """
    pieces = []          # output fragments, written in one go at the end
    literal = []         # characters of the literal currently being read
    inliteral = False
    for c in s:
        if c == '"':
            if inliteral:
                pieces.append('"' + change(''.join(literal)) + '"')
                literal = []
            inliteral = not inliteral
        elif inliteral:
            literal.append(c)
        else:
            pieces.append(c)
    if inliteral:
        # unterminated literal: keep the raw text rather than losing it
        pieces.append('"' + ''.join(literal))
    print(''.join(pieces), end='')


# convert literals in stdin 'to' or 'from' Buckwalter
if __name__ == '__main__':
    import sys

    # Each command-line mode selects the function applied to every literal.
    converters = {'from': from_buckwalter, 'to': to_buckwalter}

    # A missing argument used to end in a bare IndexError, and an unknown
    # mode silently consumed stdin without writing anything.
    if len(sys.argv) < 2 or sys.argv[1] not in converters:
        sys.exit('usage: python3 arabic_utilities.py (to|from) <input >output')

    mode = sys.argv[1]
    convert = converters[mode]
    for line in sys.stdin:
        change_literals(line, convert)


128 changes: 1 addition & 127 deletions src/arabic/wiktionary/read_wiktionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sys
import unicodedata
import pgf

from arabic_utilities import *

# data from https://kaikki.org/dictionary/rawdata.html
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
Expand Down Expand Up @@ -110,132 +110,6 @@ def get_gzip_json(file, sample=100000, langs=[]):
for labverdict, n in sorted(list(evals.items())):
print(labverdict, n)

# Buckwalter transliteration table: Unicode code point -> ASCII symbol.
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
# Keys are integer code points (not characters), so lookups go through ord().
# Every value is distinct, which lets the table be inverted for the reverse
# conversion.
buckwalter_dict = {
    0x621: "'", # ء
    0x622: '|', # آ
    0x623: '>', # أ
    0x624: '&', # ؤ
    0x625: '<', # إ
    0x626: '}', # ئ
    0x627: 'A', # ا
    0x628: 'b', # ب
    0x629: 'p', # ة
    0x62a: 't', # ت
    0x62b: 'v', # ث
    0x62c: 'j', # ج
    0x62d: 'H', # ح
    0x62e: 'x', # خ
    0x62f: 'd', # د
    0x630: '*', # ذ
    0x631: 'r', # ر
    0x632: 'z', # ز
    0x633: 's', # س
    0x634: '$', # ش
    0x635: 'S', # ص
    0x636: 'D', # ض
    0x637: 'T', # ط
    0x638: 'Z', # ظ
    0x639: 'E', # ع
    0x63a: 'g', # غ
    0x641: 'f', # ف
    0x642: 'q', # ق
    0x643: 'k', # ك
    0x644: 'l', # ل
    0x645: 'm', # م
    0x646: 'n', # ن
    0x647: 'h', # ه
    0x648: 'w', # و
    0x649: 'Y', # ى
    0x64a: 'y', # ي
    0x64b: 'F', # fathatan
    0x64c: 'N', # dammatan
    0x64d: 'K', # kasratan
    0x64e: 'a', # fatha
    0x64f: 'u', # damma
    0x650: 'i', # kasra
    0x651: '~', # shadda
    0x652: 'o', # sukun
    0x670: '`', # superscript alef
    0x671: '{' # ٱ (alef wasla)
}


# Inverse table: Buckwalter ASCII symbol -> Arabic character (a str, not a code point).
buckwalter_dict_rev = dict(zip(buckwalter_dict.values(), map(chr, buckwalter_dict)))

# Vowel marks U+064B..U+0650: fathatan, dammatan, kasratan, fatha, damma, kasra.
# Shadda (U+0651) and sukun (U+0652) are not part of this set.
arabic_vowels = set(map(chr, range(0x64b, 0x651)))

# Characters U+0628..U+0647: everything from ba up to (not including) waw,
# so alif, waw and ya are excluded.
# NOTE(review): the range also spans ta marbuta (U+0629), tatweel (U+0640) and
# U+063B..U+063F - confirm that treating them as radicals is intended.
sound_consonants = set(map(chr, range(0x628, 0x648)))

def to_buckwalter(s):
    """Transliterate the string *s* into Buckwalter ASCII symbols.

    Characters without an entry in buckwalter_dict are passed through
    unchanged.
    """
    # buckwalter_dict is keyed by code point, which is exactly the mapping
    # shape str.translate expects.
    return s.translate(buckwalter_dict)


def from_buckwalter(s):
    """Convert Buckwalter ASCII symbols in *s* back to Arabic characters.

    Symbols without an entry in buckwalter_dict_rev are passed through
    unchanged.
    """
    arabic = []
    for symbol in s:
        arabic.append(buckwalter_dict_rev.get(symbol, symbol))
    return ''.join(arabic)


def unvocalize(s):
    """Keep only the characters of *s* in U+0621..U+064A.

    Everything outside that code-point range is removed, which includes
    the diacritics from U+064B upwards and any non-Arabic characters.
    """
    first_letter, last_letter = 0x621, 0x64a
    return ''.join(ch for ch in s if first_letter <= ord(ch) <= last_letter)


def drop_final_vowel(s):
    """Return *s* without its last character if that is in arabic_vowels.

    Any other string, including the empty string, is returned unchanged.
    (Indexing s[-1] on an empty string used to raise IndexError.)
    """
    if s and s[-1] in arabic_vowels:
        return s[:-1]
    return s


def is_arabic(s):
    """Return True if *s* contains at least one Arabic-script character.

    A character counts as Arabic when its code point lies in
    U+0621..U+0652 (hamza through sukun) or is U+0670 / U+0671, i.e. the
    code points covered by the Buckwalter transliteration table.

    The previous bounds (1574..1616, i.e. U+0626..U+0650) left out the
    hamza forms U+0621..U+0625 as well as shadda and sukun.

    Empty strings and None yield False.
    """
    if not s:
        return False
    return any(
        0x621 <= ord(c) <= 0x652 or ord(c) in (0x670, 0x671)
        for c in s
    )

def normal(s):
    """Return *s* in Unicode canonical decomposition form (NFD)."""
    decompose = unicodedata.normalize
    return decompose('NFD', s)

# heuristic for finding the three radicals from certain forms
# works only for sound (strong) 3-radical roots, otherwise None
def get_sound_trigram_root(s):
    """Return the sound consonants of *s* as a 3-character string, else None.

    The characters of *s* that belong to sound_consonants are collected in
    order; the result is returned only when there are exactly three of them.
    """
    radicals = ''.join([ch for ch in s if ch in sound_consonants])
    return radicals if len(radicals) == 3 else None


# reverse engineer fcl pattern from a given form, with a sound trigram root
# TODO: better use the given root of the lex entry
def get_sound_fcl_pattern(s):
    """Return *s* with its three radicals replaced by U+0641, U+0639, U+0644.

    The radicals come from get_sound_trigram_root(s); when that yields no
    root the function returns None. Each radical is located at its first
    occurrence after the previous one and overwritten in a copy of *s*.
    """
    root = get_sound_trigram_root(s)
    if not root:
        return None
    # NOTE(review): the original guard, len([c in s for c in root]) == 3,
    # only counts the radicals; it does not check that each root letter
    # occurs exactly once in s. It is kept as the equivalent length check.
    if len(root) != 3:
        return None

    slots = list(s)
    pos = s.find(root[0])
    slots[pos] = chr(0x641)
    for radical, placeholder in ((root[1], chr(0x639)), (root[2], chr(0x644))):
        # search only to the right of the previously replaced radical
        pos += s[pos + 1:].find(radical) + 1
        slots[pos] = placeholder
    return ''.join(slots)


# Wikt uses vowel+shadda which is a Unicode normalization
# GF uses shadda+vowel which is linguistically correct
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
def reorder_shadda(s):
    """Rewrite every vowel+shadda pair in *s* as shadda+vowel.

    Handles fatha (U+064E), damma (U+064F) and kasra (U+0650), in that
    order, exactly as the earlier Buckwalter-based version did.

    The swap is done directly on the Arabic code points. The earlier
    version converted to Buckwalter and back, which also turned any ASCII
    character that happens to be a Buckwalter symbol (for example 'a' or
    'b') into an Arabic character.
    """
    shadda = '\u0651'
    for vowel in ('\u064e', '\u064f', '\u0650'):
        s = s.replace(vowel + shadda, shadda + vowel)
    return s


# quote word forms but not parameters
def quote_if(s, cond=is_arabic, change=reorder_shadda):
    """Return change(s) wrapped in double quotes when cond(s) holds, else s.

    With the defaults, strings containing Arabic are quoted (after shadda
    reordering) and everything else is returned untouched.
    """
    if not cond(s):
        return s
    quoted = '"' + change(s) + '"'
    return quoted


# generate word_d_C functions starting with d=0, but show d only when >= 1
def gf_fun(s, pos, disamb=0):
Expand Down
11 changes: 2 additions & 9 deletions src/arabic/wiktionary/to_wordnet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import csv
import json

from arabic_utilities import *

# to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl
# the following are assumed

Expand All @@ -12,15 +14,6 @@
# built as explained in ./read_wiktionary.py
MORPHO_GF = 'MorphoDictAraAbs.gf'

def is_arabic(s):
    """Return True if *s* contains at least one Arabic-script character.

    A character counts as Arabic when its code point lies in
    U+0621..U+0652 (hamza through sukun) or is U+0670 / U+0671.

    The previous bounds (1574..1616, i.e. U+0626..U+0650) left out the
    hamza forms U+0621..U+0625 as well as shadda and sukun, so filtering a
    word character by character silently dropped those characters.

    Empty strings and None yield False.
    """
    if not s:
        return False
    return any(
        0x621 <= ord(c) <= 0x652 or ord(c) in (0x670, 0x671)
        for c in s
    )

def get_arabic(s):
    """Return the characters of *s* that is_arabic() accepts, in order."""
    return ''.join(filter(is_arabic, s))

def unvocalize(s):
    """Keep only the characters of *s* in U+0621..U+064A.

    Everything outside that code-point range is removed, which includes
    the diacritics from U+064B upwards and any non-Arabic characters.
    """
    first_letter, last_letter = 0x621, 0x64a
    return ''.join(ch for ch in s if first_letter <= ord(ch) <= last_letter)


# fun 'دُبُ_N' : N ; -- 10 [['bear']]
funmap = {}
Expand Down

0 comments on commit 1c355ce

Please sign in to comment.