Files
cube/apps/music/chord/yopu.py
T
Fam Zheng ceaa2cc839
deploy music / build-and-deploy (push) Successful in 1m50s
music(chord): 选搜索结果里的功能谱(数字级数版本),不要字母谱
yopu 搜索结果同一首歌通常有多个版本,区分方式:
- 字母谱:nier-snippet 里 SVG <text> 渲染 chord 字母(G/Em7/C 等)
- 功能谱:nier-snippet 里没 SVG <text>,直接 HTML/CSS 显示 1/4/5/6m

按 svgTextCount === 0 优先选第一个功能谱,没功能谱才 fallback 到字母谱。
view 页里没有「谱面样式」「和弦样式」row(要登录 APP 才有),所以这是唯一可行路径。

实测 独家记忆/倔强/Casablanca 三首都拿到正确的功能谱截图。
2026-05-09 23:15:41 +01:00

357 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
yopu.co 和弦谱抓取(v2
跟旧 guitar 版相比,UI 改了:现在是分立的 row:
- "谱面样式" → 选 "功能谱"
- "和弦样式" → 选 "级数名"
- "和弦图" → 默认(不动)
抓取流程:
1. /explore#q=<query> 搜索
2. 找第一个含「和弦谱」字样的结果 → 进 /view/<id>
3. 在 row label = X 的行里,点 button.option 文本 = Y
4. 撑开 div.sheet-container 容器把 overflow / max-height 砍掉,让全部内容渲染
5. 截图整个 container element
6. PIL 裁白边 + padding,存 PNG
"""
import os
import time
import logging
from pathlib import Path
from urllib.parse import quote, urlparse, urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from PIL import Image
logger = logging.getLogger(__name__)
def setup_driver(window="1920,5000"):
    """Create a headless Chrome WebDriver configured for scraping.

    Args:
        window: Browser window size as a ``"width,height"`` string; it is
            passed verbatim to Chrome's ``--window-size`` flag.

    Returns:
        A ``webdriver.Chrome`` instance. When the ``CHROMEDRIVER_PATH``
        environment variable is set it is used as the chromedriver path, and
        when ``CHROME_BIN`` is set it is used as the Chrome binary location.
    """
    chrome_options = Options()

    user_agent_flag = (
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    )
    # Flags are applied in this exact order.
    for flag in (
        '--headless=new',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        f'--window-size={window}',
        '--lang=zh-CN',
        user_agent_flag,
    ):
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option(
        'prefs', {'intl.accept_languages': 'zh-CN,zh,en-US,en'}
    )

    # Optional overrides taken from the environment.
    chromedriver_path = os.getenv('CHROMEDRIVER_PATH')
    chrome_service = Service(chromedriver_path) if chromedriver_path else None

    chrome_binary = os.getenv('CHROME_BIN')
    if chrome_binary:
        chrome_options.binary_location = chrome_binary

    return webdriver.Chrome(service=chrome_service, options=chrome_options)
def find_first_chord_chart(driver, search_url):
    """Open the search page and pick the best result, preferring functional notation.

    Each ``a.post-main`` element on the search page is inspected. A result is
    treated as a *functional* (number / scale-degree) chart when it has a
    ``.nier-snippet`` element that contains no SVG ``<text>`` nodes; a snippet
    that does contain SVG ``<text>`` nodes is treated as a letter-chord chart.
    A result without any ``.nier-snippet`` gives no evidence either way and is
    therefore NOT counted as functional.

    The first functional result wins; if there is none, the first result of
    any kind is used as a fallback.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        search_url: Absolute URL of the search page; also the base against
            which the chosen result's href is resolved.

    Returns:
        ``None`` when no ``a.post-main`` element is found, otherwise a dict
        with the keys ``url``, ``title``, ``subtitle``, ``text`` and ``kind``.
    """
    logger.info("loading search: %s", search_url)
    driver.get(search_url)
    # Fixed pause before querying the DOM; presumably lets the page finish
    # rendering the result list and snippets (not verified here).
    time.sleep(3)

    hits = driver.execute_script("""
        var out = [];
        var posts = document.querySelectorAll('a.post-main');
        for (var i = 0; i < posts.length; i++) {
            var p = posts[i];
            var titleEl = p.querySelector('.title-line .title, .title');
            var subEl = p.querySelector('.title-line .subtitle, .subtitle');
            var info = p.querySelector('.one-line-info');
            var snippet = p.querySelector('.nier-snippet');
            var svgTextCount = snippet ? snippet.querySelectorAll('svg text').length : 0;
            out.push({
                href: p.href,
                title: titleEl ? (titleEl.textContent || '').trim() : '',
                subtitle: subEl ? (subEl.textContent || '').trim() : '',
                info: info ? (info.textContent || '').trim() : '',
                hasSnippet: !!snippet,
                isFunctional: !!snippet && svgTextCount === 0,
                svgTextCount: svgTextCount,
            });
        }
        return out;
    """)
    if not hits:
        logger.warning("no a.post-main found — yopu DOM changed?")
        return None

    # Prefer functional notation; otherwise fall back to the very first hit.
    functional = [h for h in hits if h.get('isFunctional')]
    if functional:
        chosen = functional[0]
        kind = 'functional'
    else:
        chosen = hits[0]
        kind = 'letter-chord (no functional version found)'

    title = chosen.get('title') or ''
    subtitle = chosen.get('subtitle') or ''
    info = chosen.get('info') or ''

    # urljoin leaves absolute URLs untouched and resolves root-relative,
    # relative and protocol-relative hrefs against the search URL.
    href = urljoin(search_url, chosen.get('href') or '')

    logger.info("[%s] %s%s [%s] (%d total: %d functional, %d letter)",
                kind, title, subtitle, info,
                len(hits), len(functional), len(hits) - len(functional))
    return {
        'url': href,
        'title': title,
        'subtitle': subtitle,
        'text': info,
        'kind': kind,
    }
def select_option_in_row(driver, row_label, button_text, timeout=10):
    """Click the button containing *button_text* inside the row labelled *row_label*.

    The row is a ``div`` whose class contains ``row`` and which has a
    descendant ``div`` whose class contains ``label`` and whose normalized
    text contains *row_label*.

    Args:
        driver: Selenium WebDriver showing the page.
        row_label: Text the row's label must contain.
        button_text: Text the button must contain.
        timeout: Seconds to wait for the row to be present.

    Returns:
        ``True`` if a matching button was clicked. ``False`` if the row or the
        button was not found, or the click raised; none of these is treated as
        an error (the UI wording may simply have changed).
    """
    row_xpath = (
        f"//div[contains(@class, 'row')][.//div[contains(@class, 'label') "
        f"and contains(normalize-space(.), '{row_label}')]]"
    )
    try:
        row = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, row_xpath))
        )
    except TimeoutException:
        logger.warning("row '%s' not found", row_label)
        return False

    candidates = row.find_elements(By.CSS_SELECTOR, "button.option, button")
    for candidate in candidates:
        label = (candidate.text or '').strip()
        if button_text not in label:
            continue
        # First matching button decides the outcome, success or failure.
        try:
            driver.execute_script(
                "arguments[0].scrollIntoView({block: 'center'});", candidate
            )
            time.sleep(0.3)
            candidate.click()
            logger.info("clicked '%s' in row '%s'", button_text, row_label)
            # Pause after the click before the caller continues.
            time.sleep(1.2)
            return True
        except Exception as exc:
            logger.warning("click failed in row '%s' / '%s': %s",
                           row_label, button_text, exc)
            return False

    logger.warning("button '%s' not found in row '%s' (had: %s)",
                   button_text, row_label,
                   [(b.text or '').strip() for b in candidates])
    return False
def expand_sheet_container(driver, container):
    """Remove clipping from the sheet container and its ancestors.

    Runs a script in the page that walks from *container* up to (but not
    including) ``document.body``. Every node whose computed vertical overflow
    is anything other than ``visible`` (``hidden``, ``auto``, ``scroll``,
    ``clip``, ...) or whose computed ``max-height`` is not ``none`` gets
    ``overflow: visible``, ``max-height: none`` and ``height: auto`` as inline
    styles. The container itself is additionally given a ``min-height`` equal
    to its ``scrollHeight`` so that an element screenshot covers the whole
    sheet.

    Side effects in the page: the container's original inline style is stored
    in its ``data-orig-style`` attribute, and the list of modified nodes (with
    their original inline styles) is stored in ``window.__yopuModified``.

    Args:
        driver: Selenium WebDriver (only ``execute_script`` is used).
        container: The sheet container WebElement.

    Returns:
        Whatever the script returns: an object with ``scrollHeight`` (the
        container's scroll height after expansion) and ``modified`` (the
        number of nodes whose style was changed in the ancestor walk).
    """
    return driver.execute_script("""
        var c = arguments[0];
        var origStyle = c.getAttribute('style') || '';
        var modified = [];
        for (var node = c; node && node !== document.body; node = node.parentElement) {
            var cs = window.getComputedStyle(node);
            // Any non-'visible' vertical overflow can clip or scroll the
            // sheet, including 'scroll' and 'clip', not just 'hidden'/'auto'.
            if (cs.overflowY !== 'visible' || cs.maxHeight !== 'none') {
                modified.push({ el: node, orig: node.getAttribute('style') || '' });
                node.style.overflow = 'visible';
                node.style.overflowY = 'visible';
                node.style.maxHeight = 'none';
                node.style.height = 'auto';
            }
        }
        c.style.overflow = 'visible';
        c.style.maxHeight = 'none';
        c.style.height = 'auto';
        c.style.minHeight = c.scrollHeight + 'px';
        c.offsetHeight;  // force reflow
        c.setAttribute('data-orig-style', origStyle);
        window.__yopuModified = modified;
        return { scrollHeight: c.scrollHeight, modified: modified.length };
    """, container)
def crop_white(path, pad_top=20, pad_bottom=50, pad_left=20, pad_right=20, white_th=250):
    """Trim the near-white margins of the image at *path* and overwrite it as PNG.

    A pixel counts as white when all of its R, G and B values are greater
    than *white_th*. A row (or column) counts as blank when at least 99% of
    its pixels are white. Columns are only measured between the detected top
    and bottom rows. The crop box is the content bounding box grown by the
    given paddings and clamped to the image bounds.

    Args:
        path: Image file to read and then overwrite.
        pad_top, pad_bottom, pad_left, pad_right: Padding in pixels kept
            around the detected content.
        white_th: Per-channel threshold above which a pixel is white.

    Returns:
        ``None``. The file is left untouched if the computed bounds are empty.
    """
    image = Image.open(path)
    width, height = image.size
    if image.mode != 'RGB':
        image = image.convert('RGB')
    pixels = image.load()

    def is_white(x, y):
        red, green, blue = pixels[x, y]
        return red > white_th and green > white_th and blue > white_th

    def row_has_content(y):
        white_count = sum(1 for x in range(width) if is_white(x, y))
        return white_count / width < 0.99

    # First / last row that is not blank; defaults keep the full height.
    top = next((y for y in range(height) if row_has_content(y)), 0)
    bottom = next(
        (y + 1 for y in range(height - 1, -1, -1) if row_has_content(y)),
        height,
    )
    if top >= bottom:
        return  # nothing but white, give up

    band_height = max(1, bottom - top)

    def col_has_content(x):
        white_count = sum(1 for y in range(top, bottom) if is_white(x, y))
        return white_count / band_height < 0.99

    # First / last column that is not blank; defaults keep the full width.
    left = next((x for x in range(width) if col_has_content(x)), 0)
    right = next(
        (x + 1 for x in range(width - 1, -1, -1) if col_has_content(x)),
        width,
    )
    if left >= right:
        return

    box = (
        max(0, left - pad_left),
        max(0, top - pad_top),
        min(width, right + pad_right),
        min(height, bottom + pad_bottom),
    )
    image.crop(box).save(path, 'PNG')
    logger.info("cropped to %s", box)
# Directory that receives failure snapshots (HTML + screenshot).
DEBUG_DIR = Path('/data/chord-debug')


def _save_debug(driver, tag: str):
    """Dump the current page HTML and a screenshot into ``DEBUG_DIR``.

    Files are named ``<tag>-<unix seconds>.html`` and ``.png``. This is a
    best-effort diagnostic aid: any exception is logged as a warning and
    swallowed so that the caller's own failure handling is not disturbed.

    Args:
        driver: Selenium WebDriver whose current page is captured.
        tag: Short label used as the file-name prefix.
    """
    try:
        DEBUG_DIR.mkdir(parents=True, exist_ok=True)
        stamp = int(time.time())
        html_file = DEBUG_DIR / f'{tag}-{stamp}.html'
        png_file = DEBUG_DIR / f'{tag}-{stamp}.png'
        html_file.write_text(driver.page_source, encoding='utf-8')
        driver.save_screenshot(str(png_file))
        logger.info("debug snapshot saved: %s/%s-%d.{html,png}", DEBUG_DIR, tag, stamp)
    except Exception as exc:
        logger.warning("debug snapshot failed: %s", exc)
def fetch_chord_chart(query: str, output_path: str, *,
                      sheet_style: str = '功能谱',
                      chord_style: str = '级数名',
                      verbose: bool = False) -> tuple[bool, str]:
    """Search yopu.co, open the chosen chart, select styles and screenshot it.

    Args:
        query: Search text; it is URL-quoted into the ``#q=`` fragment.
        output_path: Where the PNG is written (parent directories are created).
        sheet_style: Button text to pick in the "谱面样式" row.
        chord_style: Button text to pick in the "和弦样式" row.
        verbose: Use DEBUG instead of INFO for ``logging.basicConfig``.

    Returns:
        ``(ok, msg)``. On success *msg* is the output path; on failure it is
        a description of the error. Exceptions are caught and reported through
        the return value, and the driver is always quit.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=log_level, format='%(asctime)s %(levelname)s %(message)s')

    search_url = f"https://yopu.co/explore#q={quote(query)}"
    driver = None
    try:
        driver = setup_driver()

        hit = find_first_chord_chart(driver, search_url)
        if not hit:
            _save_debug(driver, 'no-search-hit')
            return False, '未找到和弦谱'

        chart_url = hit['url']
        logger.info("loading view: %s", chart_url)
        driver.get(chart_url)
        time.sleep(3)

        # Pick the styles (hard-coded MVP combination); both rows are always tried.
        picked_sheet = select_option_in_row(driver, '谱面样式', sheet_style)
        picked_chord = select_option_in_row(driver, '和弦样式', chord_style)
        if not picked_sheet or not picked_chord:
            # A failed selection suggests the UI changed; keep a snapshot for debugging.
            _save_debug(driver, 'row-not-found')

        # Pause so the content can refresh.
        time.sleep(1.5)

        try:
            sheet_el = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.sheet-container"))
            )
        except TimeoutException:
            _save_debug(driver, 'no-sheet-container')
            raise

        driver.execute_script("arguments[0].scrollIntoView(true);", sheet_el)
        time.sleep(0.5)

        expanded = expand_sheet_container(driver, sheet_el)
        logger.debug("expanded scrollHeight=%s, modified=%s ancestors",
                     expanded['scrollHeight'], expanded['modified'])
        time.sleep(1.5)

        # incrButton: enlarges the font / chord size; clicked 3 times as in
        # the old version. Failures here are logged and otherwise ignored.
        try:
            incr_buttons = driver.find_elements(By.CSS_SELECTOR, "button.incrButton")
            if incr_buttons:
                first_incr = incr_buttons[0]
                driver.execute_script(
                    "arguments[0].scrollIntoView({block: 'center'});", first_incr
                )
                time.sleep(0.3)
                for _ in range(3):
                    first_incr.click()
                    time.sleep(0.4)
        except Exception as exc:
            logger.warning("incrButton failed: %s", exc)
        time.sleep(1.0)

        # Scroll the sheet back to its top, then capture the whole container.
        driver.execute_script("arguments[0].scrollTop = 0;", sheet_el)
        time.sleep(0.4)

        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        sheet_el.screenshot(str(target))

        if not target.exists() or target.stat().st_size < 100:
            return False, '截图为空'
        logger.info("screenshot: %s (%d bytes)", target, target.stat().st_size)

        # Cropping is cosmetic; a failure keeps the uncropped screenshot.
        try:
            crop_white(str(target))
        except Exception as exc:
            logger.warning("crop failed: %s", exc)

        return True, str(target)
    except Exception as exc:
        logger.error("fetch failed: %s", exc, exc_info=True)
        return False, str(exc)
    finally:
        if driver:
            # Best-effort shutdown; errors while quitting are deliberately ignored.
            try:
                driver.quit()
            except Exception:
                pass