fd80116168
deploy music / build-and-deploy (push) Successful in 1m54s
- yopu 切 /song?title=&artist= 搜索(避免歌手词被搜糊)
- 抓的版本按搜索结果 nier-snippet svg <text> 数区分:
>0 = 字母谱 (G/Em7/C 弹唱谱);==0 = 功能谱 (1/4/5/6m 数字级数)
- sidecar fetch/status/state/image 都走 (id, mode) 维度,文件落 /data/chord-fetch/{id}-{mode}.png
- backend chord_fetch / chord_status 接 ?mode=letters|functional,import 时 role 分别为 chord_letters / chord_functional
- 前端 chord tab 拆「吉他谱」+「功能谱」,state/error/poll 各自独立;旧 role='chord' 显示在「吉他谱」兼容历史 import
- verified 标记探测:匿名访问 yopu HTML 里 0 hits(要登录可见),暂时只能按 svg_text 区分
404 lines
15 KiB
Python
404 lines
15 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
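yopu.co chord-chart fetcher (v2).

Compared with the old guitar version, the view-page UI changed. It used to
expose separate option rows:
  - "谱面样式" (sheet style)   -> pick "功能谱" (functional chart)
  - "和弦样式" (chord style)   -> pick "级数名" (scale-degree names)
  - "和弦图"   (chord diagram) -> left at its default
These rows are still clicked on a best-effort basis, but the chart variant is
now primarily chosen at search time (see find_chart).

Fetch flow:
  1. Search /song#title=<title>&artist=<artist>; if that yields no results,
     fall back to /explore#q=<query>.
  2. Pick the search result that matches the requested mode (letter chords
     vs. functional / scale-degree chart) and open its view page (/view/<id>).
  3. In the row whose label is X, click the button.option whose text is Y
     (skipped silently when the row does not exist).
  4. Expand the div.sheet-container element: strip overflow / max-height so
     the whole chart is rendered.
  5. Screenshot the whole container element.
  6. Crop the white margins with PIL, add padding, and save as PNG.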
|
||
"""
|
||
|
||
import os
|
||
import time
|
||
import logging
|
||
from pathlib import Path
|
||
from urllib.parse import quote, urlparse, urljoin
|
||
|
||
from selenium import webdriver
|
||
from selenium.webdriver.chrome.service import Service
|
||
from selenium.webdriver.common.by import By
|
||
from selenium.webdriver.support.ui import WebDriverWait
|
||
from selenium.webdriver.support import expected_conditions as EC
|
||
from selenium.webdriver.chrome.options import Options
|
||
from selenium.common.exceptions import TimeoutException
|
||
from PIL import Image
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def setup_driver(window="1920,5000"):
    """Create the headless Chrome WebDriver used for scraping.

    Args:
        window: Window size as a ``"width,height"`` string; it is passed
            straight to Chrome's ``--window-size`` flag.

    Returns:
        A new ``webdriver.Chrome`` instance. The ``CHROMEDRIVER_PATH``
        environment variable (driver executable) and ``CHROME_BIN`` (browser
        binary) are honoured when set.
    """
    user_agent_flag = (
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    )
    chrome_flags = (
        '--headless=new',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        f'--window-size={window}',
        '--lang=zh-CN',
        user_agent_flag,
    )

    chrome_opts = Options()
    for flag in chrome_flags:
        chrome_opts.add_argument(flag)
    chrome_opts.add_experimental_option('prefs', {'intl.accept_languages': 'zh-CN,zh,en-US,en'})

    # Only build an explicit Service when a driver path was configured.
    driver_path = os.getenv('CHROMEDRIVER_PATH')
    service = Service(driver_path) if driver_path else None

    chrome_bin = os.getenv('CHROME_BIN')
    if chrome_bin:
        chrome_opts.binary_location = chrome_bin

    return webdriver.Chrome(service=service, options=chrome_opts)
|
||
|
||
|
||
# Collects every search result card (a.post-main) on the current page.
# arguments[0]: whether to probe each card for a "verified" marker.
_COLLECT_HITS_JS = """
    var detectVerified = arguments[0];
    var out = [];
    var posts = document.querySelectorAll('a.post-main');
    for (var i = 0; i < posts.length; i++) {
        var p = posts[i];
        var titleEl = p.querySelector('.title-line .title, .title');
        var subEl = p.querySelector('.title-line .subtitle, .subtitle');
        var info = p.querySelector('.one-line-info');
        var snippet = p.querySelector('.nier-snippet');
        var svgTextCount = snippet ? snippet.querySelectorAll('svg text').length : 0;
        // Any descendant whose class contains 'verified' counts
        // (svelte appends a hash to class names).
        var isVerified = detectVerified
            && p.querySelectorAll('[class*="verified"]').length > 0;
        out.push({
            href: p.href,
            title: titleEl ? (titleEl.textContent || '').trim() : '',
            subtitle: subEl ? (subEl.textContent || '').trim() : '',
            info: info ? (info.textContent || '').trim() : '',
            svgTextCount: svgTextCount,
            isLetters: svgTextCount > 0,
            isFunctional: svgTextCount === 0,
            isVerified: isVerified,
        });
    }
    return out;
"""


def _load_hits(driver, url, *, detect_verified):
    """Open *url*, give the page time to render, and return its result cards."""
    driver.get(url)
    time.sleep(3)  # fixed wait for the client-side search to render its results
    return driver.execute_script(_COLLECT_HITS_JS, detect_verified)


def find_chart(driver, title: str, artist: str, prefer: str = 'functional'):
    """Search yopu for a song and pick the best matching chart view.

    The search page ``/song#title=...&artist=...`` is tried first (parameters
    go in the URL fragment); when it yields no result cards the function falls
    back to ``/explore#q=<artist title>``.

    A song usually has several versions. They are told apart by the number of
    SVG ``<text>`` nodes inside each result's ``.nier-snippet`` preview:
      - count > 0  -> letter-chord version (G/Em7/C)
      - count == 0 -> functional / scale-degree version

    Args:
        driver: Selenium WebDriver used to load the pages.
        title: Song title.
        artist: Artist name.
        prefer: ``'letters'`` or ``'functional'``. Results matching this kind
            are ranked first; among equals, "verified" results win. If nothing
            matches, the best remaining result is used as a fallback.

    Returns:
        ``None`` when neither search produced any result, otherwise a dict
        with keys ``url`` (absolute view URL), ``title``, ``subtitle``,
        ``text`` (the one-line info string) and ``kind`` (``prefer`` on a
        match, ``"<prefer>-fallback"`` otherwise).
    """
    search_url = f"https://yopu.co/song#title={quote(title)}&artist={quote(artist)}"
    logger.info("loading /song: %s", search_url)
    hits = _load_hits(driver, search_url, detect_verified=True)

    if not hits:
        # /song occasionally returns nothing; retry with the free-text search.
        logger.warning("no a.post-main found at /song — fallback to /explore")
        query = (artist + ' ' + title).strip()
        hits = _load_hits(driver, f"https://yopu.co/explore#q={quote(query)}",
                          detect_verified=False)
    if not hits:
        return None

    wanted_flag = {'letters': 'isLetters', 'functional': 'isFunctional'}.get(prefer)

    def matches_preference(hit):
        return bool(wanted_flag and hit[wanted_flag])

    def rank(hit):
        # Smaller sorts first: preferred+verified, preferred, verified, rest.
        return (not matches_preference(hit), not hit['isVerified'])

    # min() returns the first minimal element, i.e. page order breaks ties.
    chosen = min(hits, key=rank)
    kind = prefer if matches_preference(chosen) else f"{prefer}-fallback"

    # urljoin leaves absolute URLs untouched and resolves root-relative,
    # protocol-relative and relative hrefs against the current page.
    href = urljoin(driver.current_url, chosen['href'])

    logger.info("[%s] %s — %s [%s] verified=%s (total %d, letters=%d, functional=%d, verified=%d)",
                kind, chosen['title'], chosen['subtitle'], chosen['info'],
                chosen['isVerified'], len(hits),
                sum(1 for h in hits if h['isLetters']),
                sum(1 for h in hits if h['isFunctional']),
                sum(1 for h in hits if h['isVerified']))
    return {
        'url': href,
        'title': chosen.get('title') or '',
        'subtitle': chosen.get('subtitle') or '',
        'text': chosen.get('info') or '',
        'kind': kind,
    }
|
||
|
||
|
||
def _xpath_literal(text):
    """Return *text* as an XPath 1.0 string literal.

    XPath 1.0 has no escape character inside string literals, so a value
    containing both quote kinds has to be assembled with ``concat()``.
    """
    text = str(text)
    if "'" not in text:
        return f"'{text}'"
    if '"' not in text:
        return f'"{text}"'
    pieces = (f"'{part}'" for part in text.split("'"))
    return "concat(" + ", \"'\", ".join(pieces) + ")"


def select_option_in_row(driver, row_label, button_text, timeout=10):
    """Click an option button inside a labelled settings row, best effort.

    Looks for a ``div`` whose class contains ``row`` and that has a descendant
    ``div`` (class containing ``label``) whose text contains *row_label*, then
    clicks the first button in that row whose text contains *button_text*.

    Args:
        driver: Selenium WebDriver showing the view page.
        row_label: Text that identifies the row's label.
        button_text: Text that identifies the button to click.
        timeout: Upper bound in seconds for locating the row; capped at 3
            because the rows are usually absent and must not stall the flow.

    Returns:
        ``True`` if a button was clicked, ``False`` if the row or button was
        not found or the click failed. Never raises for a missing row/button.
    """
    wait = WebDriverWait(driver, min(timeout, 3))
    row_xpath = (
        "//div[contains(@class, 'row')][.//div[contains(@class, 'label') "
        f"and contains(normalize-space(.), {_xpath_literal(row_label)})]]"
    )
    try:
        row = wait.until(EC.presence_of_element_located((By.XPATH, row_xpath)))
    except TimeoutException:
        logger.debug("row '%s' not present (skipped)", row_label)
        return False

    for btn in row.find_elements(By.CSS_SELECTOR, "button.option, button"):
        if button_text not in (btn.text or '').strip():
            continue
        try:
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", btn)
            time.sleep(0.3)
            btn.click()
            logger.info("clicked '%s' in row '%s'", button_text, row_label)
            time.sleep(1.2)  # let the page react to the option change
            return True
        except Exception as e:
            # Best effort: a failed click is reported to the caller, not raised.
            logger.debug("click failed in row '%s' / '%s': %s", row_label, button_text, e)
            return False

    logger.debug("button '%s' not found in row '%s'", button_text, row_label)
    return False
|
||
|
||
|
||
def expand_sheet_container(driver, container):
    """Remove clipping from the sheet container and its ancestors.

    Runs a script in the page that walks from *container* up to (but not
    including) ``document.body`` and, for every element that clips its content
    (overflow hidden/auto or a max-height), sets overflow to visible and lifts
    the height limits. The container itself is then forced to at least its
    scroll height so an element screenshot captures the complete chart.

    Args:
        driver: Selenium WebDriver showing the view page.
        container: WebElement of the sheet container.

    Returns:
        A dict ``{'scrollHeight': <int>, 'modified': <number of elements
        whose style was changed>}`` as returned by the script.
    """
    expand_script = """
        var c = arguments[0];
        var origStyle = c.getAttribute('style') || '';
        var modified = [];
        var node = c;
        while (node && node !== document.body) {
            var cs = window.getComputedStyle(node);
            if (cs.overflow === 'hidden' || cs.overflow === 'auto'
                || cs.overflowY === 'hidden' || cs.overflowY === 'auto'
                || cs.maxHeight !== 'none') {
                modified.push({ el: node, orig: node.getAttribute('style') || '' });
                node.style.overflow = 'visible';
                node.style.overflowY = 'visible';
                node.style.maxHeight = 'none';
                node.style.height = 'auto';
            }
            node = node.parentElement;
        }
        c.style.overflow = 'visible';
        c.style.maxHeight = 'none';
        c.style.height = 'auto';
        c.style.minHeight = c.scrollHeight + 'px';
        c.offsetHeight;  // force reflow
        c.setAttribute('data-orig-style', origStyle);
        window.__yopuModified = modified;
        return { scrollHeight: c.scrollHeight, modified: modified.length };
    """
    return driver.execute_script(expand_script, container)
|
||
|
||
|
||
def crop_white(path, pad_top=20, pad_bottom=50, pad_left=20, pad_right=20, white_th=250):
    """Crop the white margins off the image at *path*, in place.

    A pixel is "white" when all of its R, G and B values exceed *white_th*.
    A row (or column) is treated as content as soon as less than 99 % of its
    pixels are white. The image is cropped to the bounding box of the content
    rows/columns, grown by the given padding and clamped to the image, and
    written back to *path* as PNG.

    If the image contains no content row at all (entirely white, or empty),
    the file is left untouched.

    Args:
        path: Path of the image file to crop and overwrite.
        pad_top, pad_bottom, pad_left, pad_right: Padding in pixels kept
            around the detected content.
        white_th: Per-channel threshold above which a pixel counts as white.
    """
    content_below = 0.99  # white ratio under which a row/column is content

    # convert() returns a fully loaded copy, so the source file handle can be
    # released before the same path is overwritten further down.
    with Image.open(path) as src:
        img = src.convert('RGB')
    w, h = img.size
    px = img.load()

    def row_has_content(y):
        white = sum(1 for x in range(w) if min(px[x, y]) > white_th)
        return white / w < content_below

    top = next((y for y in range(h) if row_has_content(y)), None)
    if top is None:
        return  # all white, give up
    # A content row exists, so the scan from the bottom is guaranteed to hit.
    bottom = next(y for y in range(h - 1, -1, -1) if row_has_content(y)) + 1

    span = bottom - top  # >= 1 because top < bottom here

    def col_has_content(x):
        white = sum(1 for y in range(top, bottom) if min(px[x, y]) > white_th)
        return white / span < content_below

    # Columns are measured only over the content rows. When no single column
    # qualifies (e.g. very sparse content) the full width is kept.
    left = next((x for x in range(w) if col_has_content(x)), 0)
    right = next((x + 1 for x in range(w - 1, -1, -1) if col_has_content(x)), w)

    box = (
        max(0, left - pad_left),
        max(0, top - pad_top),
        min(w, right + pad_right),
        min(h, bottom + pad_bottom),
    )
    img.crop(box).save(path, 'PNG')
    logger.info("cropped to %s", box)
|
||
|
||
|
||
# Directory that receives the failure snapshots written by _save_debug().
DEBUG_DIR = Path('/data/chord-debug')


def _save_debug(driver, tag: str):
    """Dump the current page (HTML source + screenshot) for troubleshooting.

    Files are written to ``DEBUG_DIR`` as ``<tag>-<unix timestamp>.html`` and
    ``.png``. This is best effort: any failure is logged as a warning and
    never propagates to the caller.

    Args:
        driver: Selenium WebDriver whose current page is captured.
        tag: Short label used as the file-name prefix.
    """
    try:
        DEBUG_DIR.mkdir(parents=True, exist_ok=True)
        ts = int(time.time())
        html_file = DEBUG_DIR / f'{tag}-{ts}.html'
        png_file = DEBUG_DIR / f'{tag}-{ts}.png'
        html_file.write_text(driver.page_source, encoding='utf-8')
        driver.save_screenshot(str(png_file))
    except Exception as e:
        logger.warning("debug snapshot failed: %s", e)
    else:
        logger.info("debug snapshot saved: %s/%s-%d.{html,png}", DEBUG_DIR, tag, ts)
|
||
|
||
|
||
def _click_size_button(driver, clicks=3):
    """Click the page's ``button.incrButton`` *clicks* times; failures are only logged."""
    try:
        buttons = driver.find_elements(By.CSS_SELECTOR, "button.incrButton")
        if not buttons:
            return
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", buttons[0])
        time.sleep(0.3)
        for _ in range(clicks):
            buttons[0].click()
            time.sleep(0.4)
    except Exception as e:
        logger.warning("incrButton failed: %s", e)


def fetch_chord_chart(title: str, artist: str, output_path: str, *,
                      mode: str = 'functional',
                      sheet_style: str = '功能谱',
                      chord_style: str = '级数名',
                      verbose: bool = False) -> tuple[bool, str]:
    """Search yopu for a song, open the best chart view and screenshot it.

    Args:
        title: Song title.
        artist: Artist name.
        output_path: Where the PNG is written (parent directories are created).
        mode: ``'functional'`` for the scale-degree chart, ``'letters'`` for
            the letter-chord chart; forwarded to ``find_chart`` as ``prefer``.
        sheet_style: Button text to click in the "谱面样式" row, if present.
        chord_style: Button text to click in the "和弦样式" row, if present.
        verbose: Configure root logging at DEBUG instead of INFO.

    Returns:
        ``(True, <output path>)`` on success, otherwise ``(False, <message>)``.
        Exceptions are caught, logged and reported through the message; the
        browser is always shut down before returning.
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    driver = None
    try:
        driver = setup_driver()
        result = find_chart(driver, title, artist, prefer=mode)
        if not result:
            _save_debug(driver, 'no-search-hit')
            return False, '未找到和弦谱'
        view_url = result['url']

        logger.info("loading view: %s", view_url)
        driver.get(view_url)
        time.sleep(3)

        # The old yopu view page had "谱面样式 / 和弦样式" rows to switch the
        # notation. The new UI dropped them (switching needs the logged-in
        # app), which is why the variant is chosen at search time instead.
        # Still try the rows on a best-effort basis; absence is not an error.
        select_option_in_row(driver, '谱面样式', sheet_style)
        select_option_in_row(driver, '和弦样式', chord_style)

        # Give the content time to refresh.
        time.sleep(1.5)

        try:
            sheet = WebDriverWait(driver, 15).until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.sheet-container")
            ))
        except TimeoutException:
            _save_debug(driver, 'no-sheet-container')
            raise

        driver.execute_script("arguments[0].scrollIntoView(true);", sheet)
        time.sleep(0.5)

        dims = expand_sheet_container(driver, sheet)
        logger.debug("expanded scrollHeight=%s, modified=%s ancestors",
                     dims['scrollHeight'], dims['modified'])
        time.sleep(1.5)

        # Enlarge the font / chord size: three clicks, same as the old version.
        _click_size_button(driver, clicks=3)
        time.sleep(1.0)

        # Scroll the sheet back to its top, then capture the whole container.
        driver.execute_script("arguments[0].scrollTop = 0;", sheet)
        time.sleep(0.4)

        out = Path(output_path)
        out.parent.mkdir(parents=True, exist_ok=True)
        # screenshot() returns False when the file could not be written; without
        # this check a stale file left at `out` would be reported as success.
        captured = sheet.screenshot(str(out))
        if not captured or not out.exists() or out.stat().st_size < 100:
            return False, '截图为空'
        logger.info("screenshot: %s (%d bytes)", out, out.stat().st_size)

        try:
            crop_white(str(out))
        except Exception as e:
            # Cropping is cosmetic; keep the uncropped screenshot on failure.
            logger.warning("crop failed: %s", e)

        return True, str(out)
    except Exception as e:
        logger.exception("fetch failed: %s", e)
        return False, str(e)
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Shutdown problems must not mask the actual result.
                logger.debug("driver.quit() failed", exc_info=True)
|