Files
Fam Zheng fd80116168
deploy music / build-and-deploy (push) Successful in 1m54s
music(chord): 拆两个 tab + 抓两种 (letters/functional)
- yopu 切 /song?title=&artist= 搜索(避免歌手词被搜糊)
- 抓的版本按搜索结果 nier-snippet svg <text> 数区分:
  >0 = 字母谱 (G/Em7/C 弹唱谱);==0 = 功能谱 (1/4/5/6m 数字级数)
- sidecar fetch/status/state/image 都走 (id, mode) 维度,文件落 /data/chord-fetch/{id}-{mode}.png
- backend chord_fetch / chord_status 接 ?mode=letters|functional,import 时 role 分别为 chord_letters / chord_functional
- 前端 chord tab 拆「吉他谱」+「功能谱」,state/error/poll 各自独立;旧 role='chord' 显示在「吉他谱」兼容历史 import
- verified 标记探测:匿名访问 yopu HTML 里 0 hits(要登录可见),暂时只能按 svg_text 区分
2026-05-10 15:10:03 +01:00

404 lines
15 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
yopu.co 和弦谱抓取(v2)
跟旧 guitar 版相比,UI 改了:现在是分立的 row:
- "谱面样式" → 选 "功能谱"
- "和弦样式" → 选 "级数名"
- "和弦图" → 默认(不动)
抓取流程:
1. /explore#q=<query> 搜索
2. 找第一个含「和弦谱」字样的结果 → 进 /view/<id>
3. 在 row label = X 的行里,点 button.option 文本 = Y
4. 撑开 div.sheet-container 容器把 overflow / max-height 砍掉,让全部内容渲染
5. 截图整个 container element
6. PIL 裁白边 + padding,存 PNG
"""
import os
import time
import logging
from pathlib import Path
from urllib.parse import quote, urlparse, urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from PIL import Image
logger = logging.getLogger(__name__)
def setup_driver(window="1920,5000"):
    """Create the headless Chrome WebDriver used for scraping.

    Args:
        window: Window size as a ``"width,height"`` string; it is passed
            straight to Chrome's ``--window-size`` flag.

    Returns:
        A ``webdriver.Chrome`` instance built from the options below.

    Environment variables:
        CHROMEDRIVER_PATH: when set, used as the chromedriver ``Service`` path.
        CHROME_BIN: when set, used as the Chrome binary location.
    """
    chrome_flags = (
        '--headless=new',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        f'--window-size={window}',
        '--lang=zh-CN',
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    )

    opts = Options()
    for flag in chrome_flags:
        opts.add_argument(flag)
    # Ask for Chinese content first so the page labels match the selectors used later.
    opts.add_experimental_option('prefs', {'intl.accept_languages': 'zh-CN,zh,en-US,en'})

    # Without CHROMEDRIVER_PATH the service stays None and Selenium picks its default.
    driver_path = os.getenv('CHROMEDRIVER_PATH')
    service = Service(driver_path) if driver_path else None

    chrome_bin = os.getenv('CHROME_BIN')
    if chrome_bin:
        opts.binary_location = chrome_bin

    return webdriver.Chrome(service=service, options=opts)
# JavaScript evaluated on a yopu result page. It returns one record per
# ``a.post-main`` card. ``arguments[0]`` (bool) enables the "verified" probe;
# when false every record reports ``isVerified: false``.
_COLLECT_HITS_JS = """
var detectVerified = !!arguments[0];
var out = [];
var posts = document.querySelectorAll('a.post-main');
for (var i = 0; i < posts.length; i++) {
    var p = posts[i];
    var titleEl = p.querySelector('.title-line .title, .title');
    var subEl = p.querySelector('.title-line .subtitle, .subtitle');
    var info = p.querySelector('.one-line-info');
    var snippet = p.querySelector('.nier-snippet');
    var svgTextCount = snippet ? snippet.querySelectorAll('svg text').length : 0;
    // Any descendant whose class contains 'verified' counts (svelte appends a hash to class names).
    var isVerified = detectVerified && p.querySelectorAll('[class*="verified"]').length > 0;
    out.push({
        href: p.href,
        title: titleEl ? (titleEl.textContent || '').trim() : '',
        subtitle: subEl ? (subEl.textContent || '').trim() : '',
        info: info ? (info.textContent || '').trim() : '',
        svgTextCount: svgTextCount,
        isLetters: svgTextCount > 0,
        isFunctional: svgTextCount === 0,
        isVerified: isVerified,
    });
}
return out;
"""


def _scrape_hits(driver, url, detect_verified, settle):
    """Load ``url``, wait ``settle`` seconds, then return the scraped result cards."""
    driver.get(url)
    time.sleep(settle)
    return driver.execute_script(_COLLECT_HITS_JS, detect_verified)


def _matches_preference(hit, prefer):
    """Return True when ``hit`` is the notation kind named by ``prefer``."""
    if prefer == 'letters':
        return bool(hit['isLetters'])
    if prefer == 'functional':
        return bool(hit['isFunctional'])
    return False


def _hit_rank(hit, prefer):
    """Ranking key for a hit (lower is better).

    0: preferred kind and verified, 1: preferred kind, 2: verified, 3: anything else.
    """
    if _matches_preference(hit, prefer):
        return 0 if hit['isVerified'] else 1
    return 2 if hit['isVerified'] else 3


def find_chart(driver, title: str, artist: str, prefer: str = 'functional', *,
               settle: float = 3.0):
    """Search yopu for a song and pick the best candidate view page.

    The /song page is queried first; if it yields no result cards the search
    falls back to /explore with ``"<artist> <title>"`` as the query.

    Result cards are classified by the number of SVG ``<text>`` nodes inside
    the card's ``.nier-snippet`` preview:

    - more than 0 -> letter chord chart (G/Em7/C)
    - exactly 0   -> functional / scale-degree chart (this also covers cards
      without any snippet)

    Args:
        driver: WebDriver-like object providing ``get``, ``execute_script``
            and ``current_url``.
        title: Song title.
        artist: Artist name.
        prefer: ``'letters'`` or ``'functional'``. The first card of that kind
            wins, verified cards first. If no card matches, the best remaining
            card is returned and ``kind`` gets a ``-fallback`` suffix.
        settle: Seconds to sleep after each page load before scraping.

    Returns:
        ``None`` when neither page yields a card, otherwise a dict with the
        keys ``url``, ``title``, ``subtitle``, ``text`` and ``kind``.
    """
    # /song takes its parameters in the URL fragment.
    search_url = f"https://yopu.co/song#title={quote(title)}&artist={quote(artist)}"
    logger.info("loading /song: %s", search_url)
    hits = _scrape_hits(driver, search_url, True, settle)

    if not hits:
        logger.warning("no a.post-main found at /song — fallback to /explore")
        query = (artist + ' ' + title).strip()
        # The verified probe is only run on /song results.
        hits = _scrape_hits(driver, f"https://yopu.co/explore#q={quote(query)}", False, settle)
    if not hits:
        return None

    # min() returns the first card with the lowest rank, so page order breaks ties.
    chosen = min(hits, key=lambda h: _hit_rank(h, prefer))
    kind = prefer if _matches_preference(chosen, prefer) else f"{prefer}-fallback"

    # urljoin leaves absolute hrefs alone and resolves relative ones against the page.
    href = urljoin(driver.current_url, chosen['href'])

    logger.info("[%s] %s%s [%s] verified=%s (total %d, letters=%d, functional=%d, verified=%d)",
                kind, chosen['title'], chosen['subtitle'], chosen['info'],
                chosen['isVerified'], len(hits),
                sum(1 for h in hits if h['isLetters']),
                sum(1 for h in hits if h['isFunctional']),
                sum(1 for h in hits if h['isVerified']))
    return {
        'url': href,
        'title': chosen.get('title') or '',
        'subtitle': chosen.get('subtitle') or '',
        'text': chosen.get('info') or '',
        'kind': kind,
    }
def select_option_in_row(driver, row_label, button_text, timeout=10):
    """Click the button containing ``button_text`` in the settings row labelled ``row_label``.

    Args:
        driver: Selenium WebDriver on the chart view page.
        row_label: Text the row's label element must contain.
        button_text: Text the button to click must contain.
        timeout: Upper bound in seconds for finding the row; capped at 3.

    Returns:
        True if a button was clicked. False if the row or button was not
        found, or the click raised; none of these is treated as an error,
        since the page wording may simply have changed.
    """
    row_xpath = (
        "//div[contains(@class, 'row')][.//div[contains(@class, 'label') "
        f"and contains(normalize-space(.), '{row_label}')]]"
    )
    # Short wait on purpose: the current yopu UI usually lacks these rows,
    # so this best-effort step must not stall the whole flow.
    short_wait = WebDriverWait(driver, min(timeout, 3))
    try:
        row = short_wait.until(EC.presence_of_element_located((By.XPATH, row_xpath)))
    except TimeoutException:
        logger.debug("row '%s' not present (skipped)", row_label)
        return False

    candidates = row.find_elements(By.CSS_SELECTOR, "button.option, button")
    # Only the first button whose visible text matches is attempted.
    target = next(
        (b for b in candidates if button_text in (b.text or '').strip()),
        None,
    )
    if target is None:
        logger.debug("button '%s' not found in row '%s'", button_text, row_label)
        return False

    try:
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", target)
        time.sleep(0.3)
        target.click()
        logger.info("clicked '%s' in row '%s'", button_text, row_label)
        time.sleep(1.2)  # give the page time to re-render after the click
    except Exception as e:
        logger.debug("click failed in row '%s' / '%s': %s", row_label, button_text, e)
        return False
    return True
def expand_sheet_container(driver, container):
    """Remove vertical clipping from the sheet container and its ancestors.

    Walks from ``container`` up to (but not including) ``document.body``.
    Every element whose computed ``overflow-y`` is anything other than
    ``visible`` (``hidden``, ``auto``, ``scroll``, ``clip``), or that has a
    ``max-height``, gets inline styles that disable clipping, so the full
    chart is laid out and can be captured by an element screenshot.

    Side effects in the page:
        - the container's original inline style is stored in its
          ``data-orig-style`` attribute;
        - ``window.__yopuModified`` holds ``{el, orig}`` records for every
          element that was changed.

    Args:
        driver: WebDriver-like object providing ``execute_script``.
        container: The sheet container element.

    Returns:
        Whatever the script returns: a dict with ``scrollHeight`` (the
        container's scroll height after expansion) and ``modified`` (the
        number of elements changed during the walk).
    """
    return driver.execute_script("""
        var c = arguments[0];
        var origStyle = c.getAttribute('style') || '';
        var modified = [];
        var node = c;
        while (node && node !== document.body) {
            var cs = window.getComputedStyle(node);
            // Any computed overflow-y other than 'visible' clips tall content:
            // hidden, auto, scroll and clip all need to be lifted.
            if (cs.overflowY !== 'visible' || cs.maxHeight !== 'none') {
                modified.push({ el: node, orig: node.getAttribute('style') || '' });
                node.style.overflow = 'visible';
                node.style.overflowY = 'visible';
                node.style.maxHeight = 'none';
                node.style.height = 'auto';
            }
            node = node.parentElement;
        }
        c.style.overflow = 'visible';
        c.style.maxHeight = 'none';
        c.style.height = 'auto';
        c.style.minHeight = c.scrollHeight + 'px';
        c.offsetHeight; // force reflow
        c.setAttribute('data-orig-style', origStyle);
        window.__yopuModified = modified;
        return { scrollHeight: c.scrollHeight, modified: modified.length };
    """, container)
def crop_white(path, pad_top=20, pad_bottom=50, pad_left=20, pad_right=20, white_th=250):
    """Trim near-white margins from the image at ``path`` in place, keeping some padding.

    A pixel is "white" when all of R, G and B exceed ``white_th``. A row (or
    column) counts as content when fewer than 99% of its pixels are white.
    Columns are measured only over the vertical span of the content rows.
    The result is written back to ``path`` as an RGB PNG.

    An image with no content row at all is left untouched on disk.

    Args:
        path: Path of the image file; overwritten with the cropped PNG.
        pad_top, pad_bottom, pad_left, pad_right: Pixels of margin kept
            around the detected content, clamped to the image bounds.
        white_th: Channel threshold above which a pixel counts as white.
    """
    # convert() always returns an in-memory copy, so the source file is
    # closed before it is overwritten below.
    with Image.open(path) as src:
        img = src.convert('RGB')
    w, h = img.size
    px = img.load()

    def is_white(x, y):
        r, g, b = px[x, y]
        return r > white_th and g > white_th and b > white_th

    def row_has_content(y):
        return sum(is_white(x, y) for x in range(w)) / w < 0.99

    top = next((y for y in range(h) if row_has_content(y)), None)
    if top is None:
        # Entirely white: nothing sensible to crop to.
        logger.debug("image is entirely white; left uncropped: %s", path)
        return
    # Row ``top`` is known to have content, so this upward scan always finds a match.
    bottom = next(y for y in range(h - 1, top - 1, -1) if row_has_content(y)) + 1

    content_rows = range(top, bottom)

    def col_has_content(x):
        return sum(is_white(x, y) for y in content_rows) / len(content_rows) < 0.99

    # If no single column passes the threshold, keep the full width.
    left = next((x for x in range(w) if col_has_content(x)), 0)
    right = next((x + 1 for x in range(w - 1, -1, -1) if col_has_content(x)), w)

    box = (
        max(0, left - pad_left),
        max(0, top - pad_top),
        min(w, right + pad_right),
        min(h, bottom + pad_bottom),
    )
    img.crop(box).save(path, 'PNG')
    logger.info("cropped to %s", box)
# Directory that receives failure snapshots (page HTML + screenshot).
DEBUG_DIR = Path('/data/chord-debug')


def _save_debug(driver, tag: str):
    """Dump the current page HTML and a screenshot into ``DEBUG_DIR``.

    Files are named ``<tag>-<unix seconds>.html`` and ``.png``. This is a
    best-effort troubleshooting aid: any failure is logged as a warning and
    never propagated.

    Args:
        driver: Selenium WebDriver whose current page is captured.
        tag: Short label used as the file name prefix.
    """
    try:
        DEBUG_DIR.mkdir(parents=True, exist_ok=True)
        stamp = int(time.time())
        stem = f'{tag}-{stamp}'
        html_file = DEBUG_DIR / f'{stem}.html'
        png_file = DEBUG_DIR / f'{stem}.png'
        html_file.write_text(driver.page_source, encoding='utf-8')
        driver.save_screenshot(str(png_file))
        logger.info("debug snapshot saved: %s/%s-%d.{html,png}", DEBUG_DIR, tag, stamp)
    except Exception as exc:
        logger.warning("debug snapshot failed: %s", exc)
def fetch_chord_chart(title: str, artist: str, output_path: str, *,
                      mode: str = 'functional',
                      sheet_style: str = '功能谱',
                      chord_style: str = '级数名',
                      verbose: bool = False) -> tuple[bool, str]:
    """Search yopu for a song, open the best chart for ``mode`` and save a screenshot.

    Args:
        title: Song title.
        artist: Artist name.
        output_path: Where the PNG is written; parent directories are created.
        mode: ``'functional'`` for the scale-degree chart, ``'letters'`` for
            the letter chord chart. Passed to ``find_chart`` as ``prefer``.
        sheet_style: Option to select in the legacy "谱面样式" row.
        chord_style: Option to select in the legacy "和弦样式" row.
            Both style options are skipped when ``mode == 'letters'``: their
            defaults switch the page to functional notation, which would turn
            a letter chart into the wrong kind.
        verbose: Log at DEBUG level instead of INFO.

    Returns:
        ``(True, output path)`` on success, otherwise ``(False, message)``.
        Exceptions are caught, logged and reported through the message.
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    driver = None
    try:
        driver = setup_driver()
        result = find_chart(driver, title, artist, prefer=mode)
        if not result:
            _save_debug(driver, 'no-search-hit')
            return False, '未找到和弦谱'

        view_url = result['url']
        logger.info("loading view: %s", view_url)
        driver.get(view_url)
        time.sleep(3)

        # The old yopu view page had style rows that could be toggled; the new
        # UI dropped them, so the search step already picks the right version.
        # Still try them best-effort, except in letters mode, where selecting
        # the functional options would defeat the requested mode.
        if mode != 'letters':
            select_option_in_row(driver, '谱面样式', sheet_style)
            select_option_in_row(driver, '和弦样式', chord_style)
        # Let the content refresh.
        time.sleep(1.5)

        wait = WebDriverWait(driver, 15)
        try:
            sheet = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.sheet-container")
            ))
        except TimeoutException:
            _save_debug(driver, 'no-sheet-container')
            raise

        driver.execute_script("arguments[0].scrollIntoView(true);", sheet)
        time.sleep(0.5)
        dims = expand_sheet_container(driver, sheet)
        logger.debug("expanded scrollHeight=%s, modified=%s ancestors",
                     dims['scrollHeight'], dims['modified'])
        time.sleep(1.5)

        # incrButton enlarges the notation; click it three times. A failure
        # here only affects size, so it is logged and ignored.
        try:
            buttons = driver.find_elements(By.CSS_SELECTOR, "button.incrButton")
            if buttons:
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", buttons[0])
                time.sleep(0.3)
                for _ in range(3):
                    buttons[0].click()
                    time.sleep(0.4)
        except Exception as e:
            logger.warning("incrButton failed: %s", e)
        time.sleep(1.0)

        # Scroll the sheet back to its top, then capture the whole container.
        driver.execute_script("arguments[0].scrollTop = 0;", sheet)
        time.sleep(0.4)

        out = Path(output_path)
        out.parent.mkdir(parents=True, exist_ok=True)
        sheet.screenshot(str(out))
        if not out.exists() or out.stat().st_size < 100:
            return False, '截图为空'
        logger.info("screenshot: %s (%d bytes)", out, out.stat().st_size)

        # Cropping is cosmetic; keep the uncropped screenshot if it fails.
        try:
            crop_white(str(out))
        except Exception as e:
            logger.warning("crop failed: %s", e)
        return True, str(out)
    except Exception as e:
        logger.exception("fetch failed: %s", e)
        return False, str(e)
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup: a failing quit must not mask the result.
                logger.debug("driver.quit() failed", exc_info=True)