-
Notifications
You must be signed in to change notification settings - Fork 0
/
update
executable file
·140 lines (124 loc) · 3.46 KB
/
update
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env node
import fs from 'fs';
import axios from 'axios';
import base64 from 'base-64';
import utf8 from 'utf8';
import et from 'elementtree';
import corpora from './corpora.json' with {type: 'json'};
// Captures the "/owner/repo" path of a GitHub repository URL. Owner and
// repository names may contain letters, digits, "_", "-" and "."; the dot in
// the host name is escaped so that it only matches a literal ".".
const repoUrlRx = /https:\/\/github\.com(\/[-.\w]+\/[-.\w]+)/i;
// Optional API token read from the environment; when present it is sent as a
// Bearer token on every axios request.
const token = process.env['GITHUB_API_TOKEN'] || null;
if (token) {
  console.log('using $GITHUB_API_TOKEN');
  axios.defaults.headers.common['Authorization'] = `Bearer ${token}`;
}
/**
 * Builds the GitHub API "contents" URL for the corpus.xml file of a
 * repository.
 *
 * @param {string} repoUrl - URL of the repository, matched against repoUrlRx.
 * @returns {string|undefined} The API URL, or undefined (after logging a
 *   warning) when repoUrl does not match repoUrlRx.
 */
function getCorpusXmlUrl(repoUrl) {
  const match = repoUrl.match(repoUrlRx);
  if (match) {
    // The first capture group holds the "/owner/repo" part of the URL.
    const [, repoPath] = match;
    return `https://api.github.com/repos${repoPath}/contents/corpus.xml`;
  }
  console.warn(`Cannot construct URL to corpus.xml for ${repoUrl}`);
  return undefined;
}
/**
 * Downloads and decodes the corpus.xml file of a repository.
 *
 * The `content` field of the API response is decoded from base64 and then
 * from UTF-8. Any failure (request or decoding) is logged as a warning and
 * results in undefined rather than a thrown error.
 *
 * @param {string} repo - URL of the repository.
 * @returns {Promise<string|undefined>} The XML text, or undefined when no URL
 *   could be constructed or the file could not be fetched or decoded.
 */
async function getCorpusXml(repo) {
  const url = getCorpusXmlUrl(repo);
  if (!url) {
    return undefined;
  }
  try {
    const response = await axios.get(url);
    return utf8.decode(base64.decode(response.data.content));
  } catch (error) {
    // Errors without an HTTP response (network failures, decoding errors)
    // carry no `response` object, so fall back to the error message instead
    // of throwing from inside the handler.
    const reason = error.response
      ? `Status: ${error.response.status}`
      : `Error: ${error.message}`;
    console.warn(`Failed to access ${url}. ${reason}`);
    return undefined;
  }
}
/**
 * Strips leading and trailing whitespace and collapses every internal run of
 * whitespace into a single space.
 *
 * @param {string} text - The text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeSpace(text) {
  // Splitting on whitespace runs leaves empty strings at the edges when the
  // text starts or ends with whitespace; those are dropped before joining.
  const words = text.split(/\s+/).filter((word) => word !== '');
  return words.join(' ');
}
/**
 * Flattens a list of elements into text, concatenating each element's text,
 * the recursively transformed children and the element's tail.
 *
 * `ref` elements are rendered as Markdown links, `[content](target)`. A `ref`
 * without a `target` attribute is rendered as plain content so that the
 * output never contains a "(undefined)" link.
 *
 * @param {Array<object>} elems - Elements exposing `tag`, `text`, `tail`,
 *   `attrib` and `_children`.
 * @returns {string} The concatenated text.
 */
function transform(elems) {
  let text = '';
  for (const e of elems) {
    const target = e.tag === 'ref' ? e.attrib.target : undefined;
    const children = e._children ? transform(e._children) : '';
    const content = `${e.text || ''}${children}`;
    text += target ? `[${content}](${target})` : content;
    // The tail is the text following the element's closing tag; it belongs
    // outside of any link markup.
    text += e.tail || '';
  }
  return text;
}
/**
 * Determines the status recorded in the revision description of a document.
 *
 * A `status` attribute on `revisionDesc` itself takes precedence. Otherwise
 * the `change` elements carrying a `status` attribute are sorted by their
 * `when` attribute in descending order and the status of the first one is
 * used.
 *
 * @param {object} etree - Parsed document exposing `find` and `findall`.
 * @returns {string|null} The status, or null when none is found.
 */
function getStatus(etree) {
  const described = etree.find('.//revisionDesc[@status]');
  if (described) {
    return described.attrib.status;
  }

  // Orders changes by their 'when' attribute, greatest value first.
  const byWhenDescending = (a, b) => {
    const left = a.attrib.when;
    const right = b.attrib.when;
    return Number(left < right) - Number(left > right);
  };

  const [latest] = etree
    .findall('.//revisionDesc/change[@status]')
    .sort(byWhenDescending);
  return latest ? latest.attrib.status : null;
}
/**
 * Extracts license information from the `availability/licence` element of a
 * document.
 *
 * @param {object} etree - Parsed document exposing `find`.
 * @returns {{name: string, url?: string}|null} The whitespace-normalized
 *   license text as `name`, plus `url` when the element has a `target`
 *   attribute; null when there is no licence element.
 */
function getLicense(etree) {
  const licenceElem = etree.find('.//availability/licence');
  if (!licenceElem) {
    return null;
  }
  const name = normalizeSpace(transform([licenceElem]));
  const {target} = licenceElem.attrib;
  return target ? {name, url: target} : {name};
}
/**
 * Collects title, description, status and license of a corpus from the
 * corpus.xml file of its repository.
 *
 * @param {string} repo - URL of the corpus repository.
 * @returns {Promise<object>} An object with `title` and `description`, plus
 *   `status` and `license` when available; an empty object when corpus.xml
 *   could not be retrieved.
 */
async function getCorpusInfo(repo) {
  const xml = await getCorpusXml(repo);
  if (!xml) {
    return {};
  }

  const tree = et.parse(xml);
  const description = normalizeSpace(
    transform(tree.findall('.//encodingDesc/projectDesc'))
  );
  const title = tree.findtext('.//titleStmt/title');
  const status = getStatus(tree);
  const license = getLicense(tree);

  // Optional fields are only added when they have a value.
  return {
    title,
    description,
    ...(status ? {status} : {}),
    ...(license ? {license} : {}),
  };
}
/**
 * Refreshes every entry of the corpora list with the information found in the
 * corpus.xml of its repository and writes the result to corpora.json and
 * corpora.js.
 *
 * Corpora are processed one at a time, in list order. Fields retrieved from
 * corpus.xml override the fields already present on the entry.
 *
 * @returns {Promise<void>}
 */
async function update() {
  const updatedCorpora = [];
  for (const corpus of corpora) {
    console.log(corpus.repository);
    const info = await getCorpusInfo(corpus.repository);
    console.log({info});
    updatedCorpora.push({...corpus, ...info});
  }
  const json = `${JSON.stringify(updatedCorpora, null, 2)}\n`;
  // Resolve the output files against this module rather than the current
  // working directory, so they land next to the corpora.json that was
  // imported regardless of where the script is started from.
  fs.writeFileSync(new URL('./corpora.json', import.meta.url), json);
  fs.writeFileSync(
    new URL('./corpora.js', import.meta.url),
    `export default ${json}`
  );
}

// Report failures explicitly and signal them through the exit code instead of
// leaving the promise unhandled.
update().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});