-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.sh
161 lines (144 loc) · 3.67 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Usage: zsh run.sh [DATE [LANGUAGES]]
#
# Automate download and word frequency list generation.
#
# See `python word_frequency.py --help` for options of the tool used in the script.
#
# Based on https://github.com/notani/wikipedia-word-frequency/blob/master/run.sh by Naoki Otani
# Variant configuration. JA and EN are generated in several variants.
# *_OPTS and *_SUFFIXES are parallel "/"-separated lists: the i-th option set
# is used to produce the result files carrying the i-th file-name suffix
# (the first suffix of each list is the empty string).
EN_OPTS='--default/--en'
EN_SUFFIXES='/-penn'
JA_OPTS='--ja/--ja -D unidic'
JA_SUFFIXES='/-310'

# First positional argument: dump date; a missing or empty value selects the
# built-in default.
DATE="${1:-20221020}"

# Drop the date (if one was given) so that only language codes remain.
(( $# >= 1 )) && shift 1

# Remaining arguments: space-separated language codes; default to the full set
# when none were given.
LANGS="$@"
LANGS="${LANGS:-cs en fr de it ja pt ru es zh id}"
# Main loop: for every requested language, mirror the Wikipedia dump (unless a
# local copy already exists) and run word_frequency.py once per variant.
for lang in ${=LANGS}; do
  # Skip languages whose base result file has already been produced.
  [ -e results/${lang}wiki-frequency-${DATE}.tsv.xz ] && continue

  dump_url="https://dumps.wikimedia.org/${lang}wiki/${DATE}/"
  dump_dir="dumps.wikimedia.org/${lang}wiki/${DATE}"

  # 1. Download
  # Only attempted when the local mirror directory does not exist yet.
  if [ ! -d ${dump_dir}/ ]; then
    # First attempt: the numbered pages-articles files.
    cmd="wget -np -r --accept-regex '${dump_url}${lang}wiki-${DATE}-pages-articles[0-9].*' ${dump_url}"
    echo $cmd
    eval $cmd
    # Fallback: if nothing named pages-articles arrived, fetch the single
    # pages-articles.xml.bz2 file instead.
    ls ${dump_dir} | grep -q 'pages-articles' || {
      cmd="wget -np -r --accept-regex '${dump_url}${lang}wiki-${DATE}-pages-articles\.xml\.bz2' ${dump_url}"
      echo $cmd
      eval $cmd
    }
  fi

  # 2. Count
  # Select the "/"-separated option sets and the matching file-name suffixes.
  case $lang in
    zh) opts="--zh";      suffixes="" ;;
    ja) opts="$JA_OPTS";  suffixes="$JA_SUFFIXES" ;;
    en) opts="$EN_OPTS";  suffixes="$EN_SUFFIXES" ;;
    # Explicit --default ensures we do an iteration.
    *)  opts="--default"; suffixes="" ;;
  esac

  for opt in "${(@s:/:)opts}"; do
    # Take the head of the suffix list, then drop it, so that suffixes are
    # consumed in step with the option sets.
    suffix="${suffixes%%/*}"
    suffixes="${suffixes#*/}"
    # The command is built as a string and eval'd so that $opt is split into
    # separate arguments and the *.bz2 glob is expanded at execution time.
    # NOTE(review): the literal "%" in the -o argument is passed through as-is;
    # presumably word_frequency.py replaces it with the mutation name - confirm.
    cmd="python word_frequency.py -x $opt ${dump_dir}/*.bz2 -o results/${lang}wiki-frequency-${DATE}${suffix}%.tsv.xz"
    echo $cmd
    eval $cmd
  done
done
# Lookup tables used when rendering the Markdown results table.
typeset -A LANGNAMES
typeset -A SUFNAMES
typeset -A MUTNAMES

# Language code -> language name shown in the first column.
LANGNAMES=(
  [cs]='Czech'      [de]='German'    [en]='English'
  [es]='Spanish'    [fr]='French'    [id]='Indonesian'
  [it]='Italian'    [ja]='Japanese'  [pt]='Portuguese'
  [ru]='Russian'    [zh]='Chinese'
)

# Subscript label shown after the language name. Keys are result-file
# suffixes (the empty key is the unsuffixed variant); the "zh" and "ja" keys
# are language codes, looked up instead of the empty suffix for those languages.
SUFNAMES=(
  []='regex'
  [-310]='Unidic 3.1.0'
  [-penn]='Penn'
  [ja]='Unidic Lite'
  [zh]='jieba, <b>experimental</b>'
)

# "/"-separated list of mutation suffixes, in column order; the leading "/"
# yields an empty first element (the unmutated variant).
MUTATIONS='/-lower/-nfkc/-nfkc-lower'

# Mutation suffix -> column heading.
MUTNAMES=(
  []='no norm.'
  [-lower]='no norm., lowercased'
  [-nfkc]='NFKC norm.'
  [-nfkc-lower]='NFKC norm., lowercased'
)
# 3. Markdown table header.
# Build the heading row and the alignment row side by side: every mutation
# contributes one right-aligned column whose dash run is as long as its title.
header='| Language / Mutation |'
table_fmt='|:------------------- |'
for mutation in "${(@s:/:)MUTATIONS}"; do
  mutname="$MUTNAMES[$mutation]"
  header+=" ${mutname} |"
  # Replace every character of the title with a dash.
  table_fmt+=" ${mutname//?/-}:|"
done
# Two fixed trailing columns close both rows.
echo "${header} #tokens | #articles |"
echo "$table_fmt -------:| ---------:|"
# 4. Markdown table body: one row per (language, variant suffix).
#
# Each mutation column links to the corresponding results file and shows a
# count derived from its line count; the last two columns show the token and
# article totals taken from the last readable mutation file of the row.
for lang in ${=LANGS}; do
  # Only JA and EN have more than one variant; every other language gets a
  # single row for the empty suffix.
  case $lang in
    ja) suffixes="$JA_SUFFIXES" ;;
    en) suffixes="$EN_SUFFIXES" ;;
    *)  suffixes="" ;;
  esac
  langname="${LANGNAMES[$lang]}"

  for suffix in "${(@s:/:)suffixes}"; do
    # The unsuffixed JA and ZH rows have labels keyed by language code; all
    # other rows are labelled by their suffix (including the empty one).
    if [[ -z "$suffix" && ( "$lang" == "ja" || "$lang" == "zh" ) ]]; then
      sufname="${SUFNAMES[$lang]}"
    else
      sufname="${SUFNAMES[$suffix]}"
    fi
    printf '| %s<sub>%s</sub> |' "$langname" "$sufname"

    # Reset per row so a row with no readable file prints zeros rather than
    # values left over from a previous row.
    tokens=0
    docs=0
    for mutation in "${(@s:/:)MUTATIONS}"; do
      file="results/${lang}wiki-frequency-${DATE}${suffix}${mutation}.tsv.xz"
      # A missing result file would otherwise render as "[-2](...)"; emit a
      # placeholder cell and report the problem on stderr instead.
      if [[ ! -r "$file" ]]; then
        echo "warning: cannot read ${file}" >&2
        printf ' n/a |'
        continue
      fi
      # awk prints: (line count - 2), then fields 2 and 3 of the last line.
      # NOTE(review): presumably the file has one header line and one final
      # totals line holding token and article counts - confirm against
      # word_frequency.py.
      totals=$(xzcat "$file" | awk 'END{ print NR-2, $2, $3 }')
      read -r types tokens docs <<< "$totals"
      # The path is passed as an argument, never as part of the format string,
      # so "%" or "\" in it cannot be interpreted by printf.
      printf " [%'d](%s) |" "$types" "$file"
    done
    printf " %'d | %'d |\n" "$tokens" "$docs"
  done
done