Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

重构 CLI #4

Merged
merged 6 commits into from
Aug 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions .github/workflows/python-package.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,7 @@ jobs:
- name: run biliarchiver tools
run: |
touch biliarchiver.home
python -m biliarchiver.cli_tools.biliarchiver -h
python -m biliarchiver.cli_tools.bili_archive_bvids -h
python -m biliarchiver.cli_tools.bili_get_bvids -h
python -m biliarchiver.cli_tools.bili_upload -h
python -m biliarchiver.cli_tools.biliarchiver
# - name: Test with pytest
# run: |
# pytest
6 changes: 3 additions & 3 deletions biliarchiver/_biliarchiver_upload_bvid.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
md = xml_chars_legalize(obj=md)
assert isinstance(md, dict)
if hash(json.dumps(md)) != _md_before:
print(f"Removed XML illegal characters from metadata, cleaned metadata:")
print("Removed XML illegal characters from metadata, cleaned metadata:")
print(md)

if filedict:
Expand Down Expand Up @@ -212,15 +212,15 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
if item.metadata.get("external-identifier") != md['external-identifier']:
new_md["external-identifier"] = md['external-identifier']
if new_md:
print(f"Updating metadata:")
print("Updating metadata:")
print(new_md)

# remove XML illegal characters
_md_before = hash(json.dumps(new_md))
new_md = xml_chars_legalize(obj=new_md)
assert isinstance(new_md, dict)
if hash(json.dumps(new_md)) != _md_before:
print(f"Removed XML illegal characters from metadata, cleaned metadata:")
print("Removed XML illegal characters from metadata, cleaned metadata:")
print(new_md)

r = item.modify_metadata(
Expand Down
2 changes: 1 addition & 1 deletion biliarchiver/archive_bvid.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ async def new_get_subtitle_info(client: httpx.AsyncClient, bvid, cid):
res = await req_retry(client, 'https://api.bilibili.com/x/player/v2', params=params)
info = json.loads(res.text)
if info['code'] == -400:
raise APIError(f'未找到字幕信息', params)
raise APIError('未找到字幕信息', params)

# 这里 monkey patch 一下把返回 lan_doc 改成返回 lan,这样生成的字幕文件名就是 语言代码 而不是 中文名 了
# 例如
Expand Down
197 changes: 93 additions & 104 deletions biliarchiver/cli_tools/bili_archive_bvids.py
Original file line number Diff line number Diff line change
@@ -1,111 +1,94 @@
import asyncio
from io import TextIOWrapper
import os
import argparse
from pathlib import Path
from typing import List, Optional, Union
from typing import List, Union

from biliarchiver.archive_bvid import archive_bvid
from biliarchiver.config import config

from bilix.sites.bilibili.downloader import DownloaderBilibili
from rich.console import Console
from httpx import AsyncClient, Client, TransportError
from rich.traceback import install
from biliarchiver.utils.http_patch import HttpOnlyCookie_Handler
from biliarchiver.utils.version_check import check_outdated_version
from biliarchiver.utils.storage import get_free_space
from biliarchiver.version import BILI_ARCHIVER_VERSION
install()

from biliarchiver.config import BILIBILI_IDENTIFIER_PERFIX
from biliarchiver.utils.identifier import human_readable_upper_part_map
from biliarchiver.utils.ffmpeg import check_ffmpeg

from biliarchiver.config import BILIBILI_IDENTIFIER_PERFIX

from dataclasses import dataclass

@dataclass
class Args:
    """Typed container for the options produced by ``parse_args``.

    Each field mirrors one ``dest`` declared on the argument parser.
    """

    # Path of the file that lists the bvids to archive (``--bvids``).
    bvids: str
    # True to start downloading without checking whether IA already holds
    # an item for the bvid (``-s`` / ``--skip-ia-check``).
    skip_ia: bool
    # Browser to import cookies from; None means the cookies_file from the
    # config is used instead (``--fb`` / ``--from-browser``).
    from_browser: Optional[str]
    # Minimum free space, in GB, below which the run stops
    # (``--min-free-space-gb``).
    min_free_space_gb: int
    # Number of leading bvids in the list to skip (``--skip-to``).
    skip_to: int = 0

def parse_args():
    """Parse the process command line into an ``Args`` instance.

    Returns:
        Args: the parsed options. ``--bvids`` is required; every other
        option falls back to the default declared below.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--bvids', dest='bvids', type=str, help='bvids 列表的文件路径', required=True)
    parser.add_argument('-s', '--skip-ia-check', dest='skip_ia', action='store_true',
                        help='不检查 IA 上是否已存在对应 BVID 的 item ,直接开始下载')
    parser.add_argument('--fb', '--from-browser', dest='from_browser', type=str, help='从指定浏览器导入 cookies (否则导入 config.json 中的 cookies_file) [default: None]', default=None)
    parser.add_argument('--min-free-space-gb', dest='min_free_space_gb', type=int, help='最小剩余空间 (GB),用超退出 [default: 10]', default=10)
    parser.add_argument('--skip-to', dest='skip_to', type=int, help='跳过前 skip_to 个 bvid [default: 0]', default=0)

    # The parser's dest names match the Args field names one-to-one, so the
    # namespace can be splatted straight into the dataclass.
    args = Args(**vars(parser.parse_args()))

    return args

def check_ia_item_exist(client: Client, identifier: str) -> bool:
    """Tell whether an archive.org item named *identifier* already exists.

    A positive answer is remembered as an empty ``<identifier>.mark`` file
    under ``config.storage_home_dir / "ia_item_exist_cache"``, so later calls
    for the same identifier return without a network request. Negative
    answers are not cached.

    Args:
        client: HTTP client used to query ``check_identifier.php``.
        identifier: the IA item identifier to look up.

    Returns:
        True if the identifier is reported as ``not_available`` (the item
        exists) or a cache mark is present; False if it is ``available``.

    Raises:
        TransportError: all three request attempts failed at transport level.
        AssertionError: the response ``type`` is not ``"success"``.
        ValueError: the response ``code`` is neither ``available`` nor
            ``not_available``.
        Anything ``raise_for_status()`` raises for an error HTTP status.
    """
    cache_dir = config.storage_home_dir / "ia_item_exist_cache"
    mark_path = cache_dir / f"{identifier}.mark"
    # Cached positive result from an earlier run.
    if mark_path.exists():
        return True

    params = {
        "identifier": identifier,
        "output": "json",
    }
    # The check_identifier.php API responds quickly.
    r = None
    last_error = None
    for _ in range(3):
        try:
            r = client.get(
                "https://archive.org/services/check_identifier.php", params=params
            )
            break
        except TransportError as e:
            last_error = e
            print(e, "retrying...")
    if r is None:
        # Every attempt failed: surface the real transport error instead of
        # an uninformative assertion (which would also vanish under -O).
        assert last_error is not None
        raise last_error

    r.raise_for_status()
    r_json = r.json()
    assert r_json["type"] == "success"
    if r_json["code"] == "available":
        return False
    elif r_json["code"] == "not_available":  # exists
        cache_dir.mkdir(parents=True, exist_ok=True)
        # An empty file is enough; only its presence is checked.
        mark_path.write_text("", encoding="utf-8")
        return True
    else:
        raise ValueError(f'Unexpected code: {r_json["code"]}')


def _main():
args = parse_args()
assert check_ffmpeg() is True, 'ffmpeg 未安装'
def _down(
bvids: TextIOWrapper,
skip_ia_check: bool,
from_browser: str | None,
min_free_space_gb: int,
skip: int,
):
assert check_ffmpeg() is True, "ffmpeg 未安装"

assert args.bvids is not None, '必须指定 bvids 列表的文件路径'
with open(args.bvids, 'r', encoding='utf-8') as f:
bvids_from_file = f.read().splitlines()
bvids_from_file = bvids.read().splitlines()

check_outdated_version(pypi_project='biliarchiver', self_version=BILI_ARCHIVER_VERSION)
check_outdated_version(
pypi_project="biliarchiver", self_version=BILI_ARCHIVER_VERSION
)

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

d = DownloaderBilibili(hierarchy=True, sess_data=None, # sess_data 将在后面装载 cookies 时装载 # type: ignore
d = DownloaderBilibili(
hierarchy=True,
sess_data=None, # sess_data 将在后面装载 cookies 时装载 # type: ignore
video_concurrency=config.video_concurrency,
part_concurrency=config.part_concurrency,
stream_retry=config.stream_retry,
)

# load cookies
if args.from_browser is not None:
update_cookies_from_browser(d.client, args.from_browser)
if from_browser is not None:
update_cookies_from_browser(d.client, from_browser)
else:
update_cookies_from_file(d.client, config.cookies_file)
client = Client(cookies=d.client.cookies, headers=d.client.headers)
Expand All @@ -114,69 +97,84 @@ def _main():
return

def check_free_space():
    """Return True when the free-space requirement is satisfied.

    Reads ``min_free_space_gb`` from the enclosing scope. A threshold of 0
    disables the check. Otherwise the free space of
    ``config.storage_home_dir`` (floor-divided by 1024 three times) must be
    strictly greater than the threshold.
    """
    if min_free_space_gb == 0:
        return True  # check disabled
    free_gib = get_free_space(path=config.storage_home_dir) // 1024 // 1024 // 1024
    # Equal to the threshold counts as "not enough".
    return free_gib > min_free_space_gb

d.progress.start()
sem = asyncio.Semaphore(config.video_concurrency)
tasks: List[asyncio.Task] = []

def tasks_check():
    """Reap finished tasks and abort the run on failure or low disk space.

    Operates on the ``tasks`` list of the enclosing scope:

    * a finished task that carries an exception causes every remaining task
      to be cancelled and that exception to be re-raised;
    * a finished task without an exception is dropped from ``tasks``;
    * afterwards, if ``check_free_space()`` fails, every remaining task is
      cancelled and ``RuntimeError`` is raised.
    """
    # Iterate over a snapshot: removing from the list that is being iterated
    # would skip the element following each removed task.
    for task in list(tasks):
        if not task.done():
            continue
        _task_exception = task.exception()
        if isinstance(_task_exception, BaseException):
            print(f"任务 {task} 出错,即将异常退出...")
            for pending in tasks:
                pending.cancel()
            raise _task_exception
        tasks.remove(task)

    # NOTE(review): the free-space check is placed after the loop (once per
    # call); confirm against the real file's indentation.
    if not check_free_space():
        print(f"剩余空间不足 {min_free_space_gb} GiB")
        for pending in tasks:
            pending.cancel()
        raise RuntimeError(f"剩余空间不足 {min_free_space_gb} GiB")

for index, bvid in enumerate(bvids_from_file):
if index < args.skip_to:
print(f'跳过 {bvid} ({index+1}/{len(bvids_from_file)})', end='\r')
if index < skip:
print(f"跳过 {bvid} ({index+1}/{len(bvids_from_file)})", end="\r")
continue
tasks_check()
if not args.skip_ia:
upper_part = human_readable_upper_part_map(string=bvid, backward=True)
remote_identifier = f'{BILIBILI_IDENTIFIER_PERFIX}-{bvid}_p1-{upper_part}'
if not skip:
upper_part = human_readable_upper_part_map(
string=bvid, backward=True)
remote_identifier = f"{BILIBILI_IDENTIFIER_PERFIX}-{bvid}_p1-{upper_part}"
if check_ia_item_exist(client, remote_identifier):
print(f'IA 上已存在 {remote_identifier} ,跳过')
print(f"IA 上已存在 {remote_identifier} ,跳过")
continue


upper_part = human_readable_upper_part_map(string=bvid, backward=True)
videos_basepath: Path = config.storage_home_dir / 'videos' / f'{bvid}-{upper_part}'
if os.path.exists(videos_basepath / '_all_downloaded.mark'):
print(f'{bvid} 所有分p都已下载过了')
videos_basepath: Path = (
config.storage_home_dir / "videos" / f"{bvid}-{upper_part}"
)
if os.path.exists(videos_basepath / "_all_downloaded.mark"):
print(f"{bvid} 所有分p都已下载过了")
continue

if len(tasks) >= config.video_concurrency:
loop.run_until_complete(asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED))
loop.run_until_complete(
asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
)
tasks_check()

print(f'=== {bvid} ({index+1}/{len(bvids_from_file)}) ===')
print(f"=== {bvid} ({index+1}/{len(bvids_from_file)}) ===")

task = loop.create_task(archive_bvid(d, bvid, logined=logined, semaphore=sem), name=f'archive_bvid({bvid})')
task = loop.create_task(
archive_bvid(d, bvid, logined=logined, semaphore=sem),
name=f"archive_bvid({bvid})",
)
tasks.append(task)

while len(tasks) > 0:
loop.run_until_complete(asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED))
loop.run_until_complete(
asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
)
tasks_check()

print("DONE")


def update_cookies_from_browser(client: AsyncClient, browser: str):
try:
import browser_cookie3

f = getattr(browser_cookie3, browser.lower())
cookies_to_update = f(domain_name="bilibili.com")
client.cookies.update(cookies_to_update)
Expand All @@ -191,59 +189,50 @@ def update_cookies_from_file(client: AsyncClient, cookies_path: Union[str, Path]
elif isinstance(cookies_path, str):
cookies_path = Path(cookies_path).expanduser()
else:
raise TypeError(f'cookies_path: {type(cookies_path)}')
raise TypeError(f"cookies_path: {type(cookies_path)}")

assert os.path.exists(cookies_path), f'cookies 文件不存在: {cookies_path}'
assert os.path.exists(cookies_path), f"cookies 文件不存在: {cookies_path}"

from http.cookiejar import MozillaCookieJar

cj = MozillaCookieJar()

with HttpOnlyCookie_Handler(cookies_path):
cj.load(f'{cookies_path}', ignore_discard=True, ignore_expires=True)
cj.load(f"{cookies_path}", ignore_discard=True, ignore_expires=True)
loadded_cookies = 0
loadded_keys = []
for cookie in cj:
# only load bilibili cookies
if not 'bilibili.com' in cookie.domain:
if "bilibili.com" not in cookie.domain:
continue
if cookie.name in loadded_keys:
print(f'跳过重复的 cookies: {cookie.name}')
print(f"跳过重复的 cookies: {cookie.name}")
# httpx 不能处理不同域名的同名 cookies,只好硬去重了
continue
assert cookie.value is not None
client.cookies.set(
cookie.name, cookie.value, domain=cookie.domain, path=cookie.path
)
)
loadded_keys.append(cookie.name)
loadded_cookies += 1
print(f'从 {cookies_path} 品尝了 {loadded_cookies} 块 cookies')
print(f"从 {cookies_path} 品尝了 {loadded_cookies} 块 cookies")
if loadded_cookies > 100:
print('吃了过多的 cookies,可能导致 httpx.Client 怠工,响应非常缓慢')
print("吃了过多的 cookies,可能导致 httpx.Client 怠工,响应非常缓慢")

assert client.cookies.get('SESSDATA') is not None, 'SESSDATA 不存在'
assert client.cookies.get("SESSDATA") is not None, "SESSDATA 不存在"
# print(f'SESS_DATA: {client.cookies.get("SESSDATA")}')


def is_login(cilent: Client) -> bool:
    """Check whether the client's cookies belong to a logged-in Bilibili session.

    Requests the web account endpoint and inspects the ``code`` field of the
    JSON reply, printing a status message either way.

    Args:
        cilent: HTTP client carrying the cookies to verify.

    Returns:
        True if the reply's ``code`` is 0, False otherwise.

    Raises:
        Anything ``raise_for_status()`` raises for an error HTTP status.
    """
    r = cilent.get("https://api.bilibili.com/x/member/web/account")
    r.raise_for_status()
    nav_json = r.json()
    if nav_json["code"] != 0:
        print("未登录/SESSDATA无效/过期,你这饼干它保真吗?")
        return False

    print("BiliBili 登录成功,饼干真香。")
    print(
        "NOTICE: 存档过程中请不要在 cookies 的源浏览器访问 B 站,避免 B 站刷新"
        " cookies 导致我们半路下到的视频全是 480P 的优酷土豆级醇享画质。"
    )
    return True

def main():
    """Command-line entry point: run ``_main`` and always restore the cursor.

    A ``KeyboardInterrupt`` is reported with a short message instead of a
    traceback. Any other exception propagates after the cleanup below.
    """
    try:
        _main()
    except KeyboardInterrupt:
        print('KeyboardInterrupt')
    finally:
        # Make the terminal cursor visible again on every exit path
        # (presumably the progress display hides it — TODO confirm).
        console = Console()
        console.show_cursor()

if __name__ == '__main__':
main()
Loading