Skip to content

Commit

Permalink
第一次提交
Browse files Browse the repository at this point in the history
  • Loading branch information
youyou-sudo committed Sep 4, 2024
0 parents commit a30eabb
Show file tree
Hide file tree
Showing 4 changed files with 270 additions and 0 deletions.
55 changes: 55 additions & 0 deletions .github/workflows/vndb-process.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Download the latest VNDB database dump, convert it to JSON, and attach the
# result to a fixed GitHub release on every push to main.
name: Process VNDB Data

on:
  push:
    branches:
      - main

permissions:
  contents: write
  actions: write

jobs:
  vndb-process:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        # v2 runs on a removed Node.js runtime; v4 is the current release.
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install zstandard requests

      - name: Download and extract VNDB dump
        run: |
          python script/dlvndb.py

      - name: Process JSON data
        run: |
          python script/vn_data_json.py

      - name: Upload to GitHub Release
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          FILE_PATH="vn_data.json"
          FILE_NAME="vn_data.json"
          # Upload the file to the existing release. The payload is JSON, so
          # declare it as such (it was wrongly labelled application/zip).
          # NOTE: the upload fails with HTTP 422 if an asset with the same
          # name already exists on the release; the old asset must be
          # deleted first for repeated runs to succeed.
          RESPONSE=$(curl -s -X POST \
            -H "Authorization: token $GITHUB_TOKEN" \
            -H "Accept: application/vnd.github+json" \
            -H "Content-Type: application/json" \
            --data-binary @"$FILE_PATH" \
            "https://uploads.github.com/repos/youyou-sudo/vndb-search-dataset/releases/173472428/assets?name=$FILE_NAME")
          echo "Response: $RESPONSE"
1 change: 1 addition & 0 deletions gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/script/vndb_data/*
48 changes: 48 additions & 0 deletions script/dlvndb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
import tarfile

import requests
import zstandard as zstd


# 下载文件
# Download a file
def download_file(url, output_path, timeout=60):
    """Stream the resource at *url* into the local file *output_path*.

    Args:
        url: HTTP(S) URL to download.
        output_path: Local path the response body is written to.
        timeout: Connect/read timeout in seconds. Without one,
            ``requests.get`` can block forever on a stalled connection.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # stream=True keeps the multi-gigabyte dump out of memory.
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()  # fail fast on HTTP errors

    with open(output_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    print(f"Downloaded {output_path}")


# 解压zst文件
# Decompress a .zst file
def decompress_zst(input_path, output_path):
    """Decompress the Zstandard file *input_path* into *output_path*."""
    decompressor = zstd.ZstdDecompressor()
    # Stream from source to destination so the whole dump never sits in RAM.
    with open(input_path, 'rb') as src, open(output_path, 'wb') as dst:
        decompressor.copy_stream(src, dst)
    print(f"Decompressed to {output_path}")


# 解压tar文件
# Extract a tar file
def extract_tar(tar_path, extract_path):
    """Safely extract the archive *tar_path* into *extract_path*.

    A plain ``extractall`` on a downloaded archive is vulnerable to path
    traversal (members named ``../x`` or with absolute paths can write
    outside *extract_path*), so every member path is validated first.

    Raises:
        ValueError: If the archive contains a member that would be
            extracted outside *extract_path*.
    """
    dest = os.path.realpath(extract_path)
    with tarfile.open(tar_path, 'r') as tar:
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(dest, member.name))
            if target != dest and not target.startswith(dest + os.sep):
                raise ValueError(f"Unsafe path in archive: {member.name}")
        tar.extractall(path=extract_path)
    print(f"Extracted {tar_path} to {extract_path}")


if __name__ == "__main__":
vndb_url = "https://dl.vndb.org/dump/vndb-db-latest.tar.zst"
tar_zst_path = "vndb-db-latest.tar.zst"
tar_path = "vndb-db-latest.tar"
extract_dir = "./vndb_data"

# 下载文件
download_file(vndb_url, tar_zst_path)

# 解压 zst 文件到 tar 文件
decompress_zst(tar_zst_path, tar_path)

# 解压 tar 文件
extract_tar(tar_path, extract_dir)
166 changes: 166 additions & 0 deletions script/vn_data_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import csv
import json
from collections import defaultdict
from datetime import datetime

# Column names for each TSV table of the VNDB database dump. The dump files
# ship without header rows, so the field names are supplied explicitly to
# csv.DictReader in the order the columns appear in each file.
vn_columns = ['id', 'olang', 'image', 'l_wikidata', 'c_votecount', 'c_rating', 'c_average', 'length', 'devstatus',
              'alias', 'l_renai', 'description']
vn_titles_columns = ['id', 'lang', 'official', 'title', 'latin']
releases_titles_columns = ['id', 'lang', 'mtl', 'title', 'latin']
releases_vn_columns = ['id', 'vid', 'rtype']

# Columns of the `releases` table: release metadata (released date, voiced,
# resolution, age rating, ...) plus many external shop/store link ids
# (the l_* fields). Only a handful of these are actually copied into the
# output JSON below.
releases_fields = [
    'id', 'olang', 'gtin', 'l_toranoana', 'l_appstore', 'l_nintendo_jp', 'l_nintendo_hk', 'released',
    'l_steam', 'l_digiket', 'l_melon', 'l_mg', 'l_getchu', 'l_getchudl', 'l_egs', 'l_erotrail',
    'l_melonjp', 'l_gamejolt', 'l_animateg', 'l_freem', 'l_novelgam', 'voiced', 'reso_x', 'reso_y',
    'minage', 'ani_story', 'ani_ero', 'ani_story_sp', 'ani_story_cg', 'ani_cutscene', 'ani_ero_sp',
    'ani_ero_cg', 'ani_bg', 'ani_face', 'has_ero', 'patch', 'freeware', 'uncensored', 'official',
    'website', 'catalog', 'engine', 'notes', 'l_dlsite', 'l_gog', 'l_denpa', 'l_jlist', 'l_jastusa',
    'l_itch', 'l_nutaku', 'l_googplay', 'l_fakku', 'l_freegame', 'l_playstation_jp', 'l_playstation_na',
    'l_playstation_eu', 'l_playstation_hk', 'l_nintendo', 'l_gyutto', 'l_dmm', 'l_booth', 'l_patreonp',
    'l_patreon', 'l_substar'
]


def clean_data(data):
    """Strip NULL-placeholder fields in place.

    The VNDB dump encodes NULL as the literal string ``\\N``; any key whose
    value is exactly that marker is removed from every entry, including the
    nested ``titles`` and ``releases`` records.
    """

    def _strip_nulls(record):
        # Collect keys first, then delete: a dict must not shrink mid-iteration.
        for key in [k for k, v in record.items() if v == '\\N']:
            del record[key]

    for entry in data:
        _strip_nulls(entry)
        for child in entry.get('titles', []):
            _strip_nulls(child)
        for child in entry.get('releases', []):
            _strip_nulls(child)


def tsv_to_json(vn_tsv_file_path, vn_titles_tsv_file_path, releases_titles_tsv_file_path, releases_vn_tsv_file_path,
                releases_file_path, json_file_path):
    """Merge several VNDB dump TSV tables into one JSON file.

    Reads the headerless TSV tables ``vn``, ``vn_titles``,
    ``releases_titles``, ``releases_vn`` and ``releases``, joins them by
    visual-novel / release id, drops ``\\N`` placeholder fields and entries
    without an image, and writes the resulting list to *json_file_path*.
    """
    # One output record per visual-novel id, created on first access.
    vn_data = defaultdict(
        lambda: {"id": None, "titles": [], "image": None, "olang": None, "alias": [], "releases": []})

    # Read the vn table: basic per-VN attributes.
    with open(vn_tsv_file_path, newline='', encoding='utf-8') as vn_tsv_file:
        reader = csv.DictReader(vn_tsv_file, delimiter='\t', fieldnames=vn_columns)

        for row in reader:
            entry_id = row['id']
            vn_data[entry_id]["id"] = entry_id
            vn_data[entry_id]["image"] = row["image"]
            vn_data[entry_id]["olang"] = row.get("olang", None)
            # Aliases come as one field with literal "\n" separators;
            # split into a list of non-empty, trimmed strings.
            aliases = row["alias"].replace('\\n', '\n').splitlines()
            vn_data[entry_id]["alias"] = [alias.strip() for alias in aliases if alias.strip()]

    # Read the vn_titles table: per-language titles for each VN.
    with open(vn_titles_tsv_file_path, newline='', encoding='utf-8') as vn_titles_tsv_file:
        reader = csv.DictReader(vn_titles_tsv_file, delimiter='\t', fieldnames=vn_titles_columns)

        for row in reader:
            entry_id = row['id']
            if entry_id in vn_data:  # only attach titles to VN ids seen above
                vn_data[entry_id]["titles"].append({
                    "lang": row["lang"],
                    "official": row["official"],
                    "title": row["title"],
                    "latin": row["latin"]
                })

    # Read the releases_vn table: release id -> list of {vid, rtype} links.
    releases_mapping = defaultdict(list)
    with open(releases_vn_tsv_file_path, newline='', encoding='utf-8') as releases_vn_tsv_file:
        reader = csv.DictReader(releases_vn_tsv_file, delimiter='\t', fieldnames=releases_vn_columns)

        for row in reader:
            releases_mapping[row['id']].append({
                "vid": row["vid"],
                "rtype": row["rtype"]
            })

    # Read the releases_titles table and merge with the vid/rtype links.
    # NOTE(review): only the FIRST title row per release id is kept, and each
    # append below adds the whole mapping list (producing a list of lists) —
    # the nested loops further down unwrap it again, and the (rid, lang)
    # dedupe set prevents duplicate output if a list is appended twice.
    releases_data = {}
    with open(releases_titles_tsv_file_path, newline='', encoding='utf-8') as releases_titles_tsv_file:
        reader = csv.DictReader(releases_titles_tsv_file, delimiter='\t', fieldnames=releases_titles_columns)

        for row in reader:
            entry_id = row['id']
            if entry_id not in releases_data:
                releases_data[entry_id] = {
                    "id": row["id"],
                    "lang": row["lang"],
                    "mtl": row["mtl"],
                    "title": row["title"],
                    "latin": row["latin"],
                    "releases": []
                }
            releases_data[entry_id]["releases"].append(releases_mapping[entry_id])

    # Read the releases table and attach release records to their VNs.
    added_releases = set()  # (rid, lang) pairs already emitted, for dedupe
    with open(releases_file_path, newline='', encoding='utf-8') as releases_file:
        reader = csv.DictReader(releases_file, delimiter='\t', fieldnames=releases_fields)

        for row in reader:
            entry_id = row['id']
            if entry_id in releases_data:
                for release in releases_data[entry_id]["releases"]:
                    for r in release:  # unwrap the nested mapping lists
                        if r["vid"] in vn_data:  # only for VN ids seen above
                            # Normalize the release date field.
                            released_str = row["released"]
                            try:
                                # Parse the compact YYYYMMDD form into a datetime
                                released_date = datetime.strptime(released_str, "%Y%m%d")
                                # and emit it as an ISO 8601 string.
                                released_iso = released_date.isoformat()
                            except ValueError:
                                # Unparseable values (presumably VNDB's
                                # partial/unknown-date placeholders —
                                # TODO confirm) become None.
                                released_iso = None

                            release_data = {
                                "rid": row["id"],
                                "released": released_iso,
                                "lang": releases_data[entry_id]["lang"],
                                "mtl": releases_data[entry_id]["mtl"],
                                "title": releases_data[entry_id]["title"],
                                "latin": releases_data[entry_id]["latin"],
                                "rtype": r["rtype"]
                            }
                            # Copy shop-link fields, skipping "0" (no link).
                            if row.get("l_steam") != "0":
                                release_data["l_steam"] = row.get("l_steam")
                            if row.get("l_digiket") != "0":
                                release_data["l_digiket"] = row.get("l_digiket")
                            if row.get("l_egs") != "0":
                                release_data["l_egs"] = row.get("l_egs")
                            if row.get("l_dlsite") != "0":
                                release_data["l_dlsite"] = row.get("l_dlsite")

                            release_key = (release_data["rid"], release_data["lang"])  # (rid, lang) uniquely identifies an entry
                            if release_key not in added_releases:
                                vn_data[r["vid"]]["releases"].append(release_data)
                                added_releases.add(release_key)

    # Drop fields whose value is the "\N" NULL placeholder.
    transformed_data_list = list(vn_data.values())
    clean_data(transformed_data_list)

    # Drop entries that have no image field.
    transformed_data_list = [entry for entry in transformed_data_list if entry.get("image")]

    # Serialize the merged data to JSON.
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(transformed_data_list, json_file, ensure_ascii=False, indent=4)


# Entry point: convert the extracted VNDB dump tables into vn_data.json.
# Guarded so that importing this module (e.g. from a test) does not trigger
# the full conversion as an import side effect; `python script/vn_data_json.py`
# behaves exactly as before.
if __name__ == "__main__":
    tsv_to_json('./vndb_data/db/vn', './vndb_data/db/vn_titles', './vndb_data/db/releases_titles',
                './vndb_data/db/releases_vn',
                './vndb_data/db/releases', 'vn_data.json')

0 comments on commit a30eabb

Please sign in to comment.