factored out arabic_utilities.py as a separate file

GrammaticalFramework · Sep 25, 2023 · 1c355ce · 1c355ce
1 parent 561a8c1
commit 1c355ce
Show file tree

Hide file tree

Showing 3 changed files with 172 additions and 136 deletions.
diff --git a/src/arabic/wiktionary/arabic_utilities.py b/src/arabic/wiktionary/arabic_utilities.py
@@ -0,0 +1,169 @@
+# utilities for Arabic script
+# in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter
+# as specified by the command line argument:
+#
+#   % python3 arabic_utilities.py to <MorphoDictAra.gf | python3 arabic_utilities.py from >b.tmp
+#   % diff MorphoDictAra.gf b.tmp 
+#   % 
+
+def is_arabic(s):
+    return s and any(1574 <= ord(c) <= 1616 for c in s)
+
+
+def get_arabic(s):
+    return ''.join([c for c in s if is_arabic(c)])
+
+
+def unvocalize(s):
+    return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
+
+
+# https://en.wikipedia.org/wiki/Buckwalter_transliteration
+buckwalter_dict = {
+  0x621: "'",  # ء
+  0x622: '|',  # آ
+  0x623: '>',  # أ
+  0x624: '&',  # ؤ
+  0x625: '<',  # إ
+  0x626: '}',  # ئ
+  0x627: 'A',  # ا
+  0x628: 'b',  # ب
+  0x629: 'p',  # ة
+  0x62a: 't',  # ت
+  0x62b: 'v',  # ث
+  0x62c: 'j',  # ج
+  0x62d: 'H',  # ح
+  0x62e: 'x',  # خ
+  0x62f: 'd',  # د
+  0x630: '*',  # ذ
+  0x631: 'r',  # ر
+  0x632: 'z',  # ز
+  0x633: 's',  # س
+  0x634: '$',  # ش
+  0x635: 'S',  # ص
+  0x636: 'D',  # ض
+  0x637: 'T',  # ط
+  0x638: 'Z',  # ظ
+  0x639: 'E',  # ع
+  0x63a: 'g',  # غ
+  0x641: 'f',  # ف  
+  0x642: 'q',  # ق
+  0x643: 'k',  # ك
+  0x644: 'l',  # ل
+  0x645: 'm',  # م
+  0x646: 'n',  # ن
+  0x647: 'h',  # ه
+  0x648: 'w',  # و
+  0x649: 'Y',  # ى
+  0x64a: 'y',  # ي
+  0x64b: 'F',  # ً
+  0x64c: 'N',  # ٌ
+  0x64d: 'K',  # ٍ
+  0x64e: 'a',  # َ
+  0x64f: 'u',  # ُ
+  0x650: 'i',  # ِ
+  0x651: '~',  # ّ
+  0x652: 'o',  # ْ
+  0x670: '`',  # '
+  0x671: '{'   # ٱ
+  }
+
+
+buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
+
+arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
+
+sound_consonants = {chr(c) for c in range(0x628, 0x648)}  # excluding alif, waw, ya
+
+def to_buckwalter(s):
+    return ''.join([buckwalter_dict.get(ord(c), c) for c in s])
+
+
+def from_buckwalter(s):
+    return ''.join([buckwalter_dict_rev.get(c, c) for c in s])
+
+
+def drop_final_vowel(s):
+    if s[-1] in arabic_vowels:
+        return s[:-1]
+    else:
+        return s
+
+
+def normal(s):
+    return unicodedata.normalize('NFD', s)
+
+# heuristic for finding the three radicals from certain forms
+# works only for sound (strong) 3-radical roots, otherwise None
+def get_sound_trigram_root(s):
+    sounds = [c for c in s if c in sound_consonants]
+    if len(sounds) == 3:
+        return ''.join(sounds)
+    else:
+        return None
+
+
+# reverse engineer fcl pattern from a given form, with a sound trigram root
+# one more condition: each of the root letters occurs exactly ones
+# TODO: better use the given root of the lex entry
+def get_sound_fcl_pattern(s):
+    if root := get_sound_trigram_root(s):
+        if len([c in s for c in root]) == 3:
+            p = list(s)
+            r = s.find(root[0])
+            p[r] = chr(0x641)
+            r += s[r+1:].find(root[1]) + 1
+            p[r] = chr(0x639)
+            r += s[r+1:].find(root[2]) + 1
+            p[r] = chr(0x644)
+            p = ''.join(p)
+##            print('---PATT', s, root, p)
+            return p
+
+
+# Wikt uses vowel+shadda which is a Unicode normalization
+# GF uses shadda+vowel which is linguistically correct
+# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
+# unicodedata.normalize does this wrong, as noted by Ariel Gutman 
+## todo: more direct implementation
+def reorder_shadda(s):
+    return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i'))
+
+
+# quote word forms but not parameters
+def quote_if(s, cond=is_arabic, change=reorder_shadda):
+    if cond(s):
+        return '"' + change(s) + '"'
+    else:
+        return s
+
+
+# for a string, change each string literal in "..." with a change function
+# leaving other characters as they are; print the string to stdout as you go
+def change_literals(s, change):
+    inliteral = False
+    literal = ''
+    for c in s:
+        if c == '"' and inliteral:
+            print('"'+change(literal)+'"', end='')
+            inliteral = False
+            literal = ''
+        elif c == '"':
+            inliteral = True
+        elif inliteral:
+            literal += c
+        else:
+            print(c, end='')
+
+
+# convert literals in stdin 'to' or 'from' Buckwalter
+if __name__ == '__main__':
+    import sys
+    mode = sys.argv[1]
+    for line in sys.stdin:
+        if mode == 'from':
+            change_literals(line, from_buckwalter)
+        elif mode == 'to':
+            change_literals(line, to_buckwalter)
+
+
diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
@@ -3,7 +3,7 @@
 import sys
 import unicodedata
 import pgf
-
+from arabic_utilities import *
 
 # data from https://kaikki.org/dictionary/rawdata.html
 # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
@@ -110,132 +110,6 @@ def get_gzip_json(file, sample=100000, langs=[]):
     for labverdict, n in sorted(list(evals.items())):
         print(labverdict, n)
 
-# https://en.wikipedia.org/wiki/Buckwalter_transliteration
-buckwalter_dict = {
-  0x621: "'",  # ء
-  0x622: '|',  # آ
-  0x623: '>',  # أ
-  0x624: '&',  # ؤ
-  0x625: '<',  # إ
-  0x626: '}',  # ئ
-  0x627: 'A',  # ا
-  0x628: 'b',  # ب
-  0x629: 'p',  # ة
-  0x62a: 't',  # ت
-  0x62b: 'v',  # ث
-  0x62c: 'j',  # ج
-  0x62d: 'H',  # ح
-  0x62e: 'x',  # خ
-  0x62f: 'd',  # د
-  0x630: '*',  # ذ
-  0x631: 'r',  # ر
-  0x632: 'z',  # ز
-  0x633: 's',  # س
-  0x634: '$',  # ش
-  0x635: 'S',  # ص
-  0x636: 'D',  # ض
-  0x637: 'T',  # ط
-  0x638: 'Z',  # ظ
-  0x639: 'E',  # ع
-  0x63a: 'g',  # غ
-  0x641: 'f',  # ف  
-  0x642: 'q',  # ق
-  0x643: 'k',  # ك
-  0x644: 'l',  # ل
-  0x645: 'm',  # م
-  0x646: 'n',  # ن
-  0x647: 'h',  # ه
-  0x648: 'w',  # و
-  0x649: 'Y',  # ى
-  0x64a: 'y',  # ي
-  0x64b: 'F',  # ً
-  0x64c: 'N',  # ٌ
-  0x64d: 'K',  # ٍ
-  0x64e: 'a',  # َ
-  0x64f: 'u',  # ُ
-  0x650: 'i',  # ِ
-  0x651: '~',  # ّ
-  0x652: 'o',  # ْ
-  0x670: '`',  # '
-  0x671: '{'   # ٱ
-  }
-
-
-buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
-
-arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
-
-sound_consonants = {chr(c) for c in range(0x628, 0x648)}  # excluding alif, waw, ya
-
-def to_buckwalter(s):
-    return ''.join([buckwalter_dict.get(ord(c), c) for c in s])
-
-
-def from_buckwalter(s):
-    return ''.join([buckwalter_dict_rev.get(c, c) for c in s])
-
-
-def unvocalize(s):
-    return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
-
-
-def drop_final_vowel(s):
-    if s[-1] in arabic_vowels:
-        return s[:-1]
-    else:
-        return s
-
-
-def is_arabic(s):
-    return s and any(1574 <= ord(c) <= 1616 for c in s)
-
-def normal(s):
-    return unicodedata.normalize('NFD', s)
-
-# heuristic for finding the three radicals from certain forms
-# works only for sound (strong) 3-radical roots, otherwise None
-def get_sound_trigram_root(s):
-    sounds = [c for c in s if c in sound_consonants]
-    if len(sounds) == 3:
-        return ''.join(sounds)
-    else:
-        return None
-
-
-# reverse engineer fcl pattern from a given form, with a sound trigram root
-# one more condition: each of the root letters occurs exactly ones
-# TODO: better use the given root of the lex entry
-def get_sound_fcl_pattern(s):
-    if root := get_sound_trigram_root(s):
-        if len([c in s for c in root]) == 3:
-            p = list(s)
-            r = s.find(root[0])
-            p[r] = chr(0x641)
-            r += s[r+1:].find(root[1]) + 1
-            p[r] = chr(0x639)
-            r += s[r+1:].find(root[2]) + 1
-            p[r] = chr(0x644)
-            p = ''.join(p)
-##            print('---PATT', s, root, p)
-            return p
-
-
-# Wikt uses vowel+shadda which is a Unicode normalization
-# GF uses shadda+vowel which is linguistically correct
-# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
-# unicodedata.normalize does this wrong, as noted by Ariel Gutman 
-## todo: more direct implementation
-def reorder_shadda(s):
-    return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i'))
-
-
-# quote word forms but not parameters
-def quote_if(s, cond=is_arabic, change=reorder_shadda):
-    if cond(s):
-        return '"' + change(s) + '"'
-    else:
-        return s
-
 
 # generate word_d_C functions starting with d=0, but show d only when >= 1
 def gf_fun(s, pos, disamb=0):

diff --git a/src/arabic/wiktionary/to_wordnet.py b/src/arabic/wiktionary/to_wordnet.py
@@ -1,6 +1,8 @@
 import csv
 import json
 
+from arabic_utilities import *
+
 # to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl
 # the following are assumed
 
@@ -12,15 +14,6 @@
 # built as explained in ./read_wiktionary.py
 MORPHO_GF = 'MorphoDictAraAbs.gf'
 
-def is_arabic(s):
-    return s and any(1574 <= ord(c) <= 1616 for c in s)
-
-def get_arabic(s):
-    return ''.join([c for c in s if is_arabic(c)])
-
-def unvocalize(s):
-    return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
-
 
 # fun 'دُبُ_N' : N ; -- 10 [['bear']]
 funmap = {}