356 lines
13 KiB
Python
356 lines
13 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
yopu.co 和弦谱抓取(v2)
|
||
|
||
跟旧 guitar 版相比,UI 改了:现在是分立的 row:
|
||
- "谱面样式" → 选 "功能谱"
|
||
- "和弦样式" → 选 "级数名"
|
||
- "和弦图" → 默认(不动)
|
||
|
||
抓取流程:
|
||
1. /explore#q=<query> 搜索
|
||
2. 找第一个含「和弦谱」字样的结果 → 进 /view/<id>
|
||
3. 在 row label = X 的行里,点 button.option 文本 = Y
|
||
4. 撑开 div.sheet-container 容器把 overflow / max-height 砍掉,让全部内容渲染
|
||
5. 截图整个 container element
|
||
6. PIL 裁白边 + padding,存 PNG
|
||
"""
|
||
|
||
import os
|
||
import time
|
||
import logging
|
||
from pathlib import Path
|
||
from urllib.parse import quote, urlparse, urljoin
|
||
|
||
from selenium import webdriver
|
||
from selenium.webdriver.chrome.service import Service
|
||
from selenium.webdriver.common.by import By
|
||
from selenium.webdriver.support.ui import WebDriverWait
|
||
from selenium.webdriver.support import expected_conditions as EC
|
||
from selenium.webdriver.chrome.options import Options
|
||
from selenium.common.exceptions import TimeoutException
|
||
from PIL import Image
|
||
|
||
# Module-level logger; fetch_chord_chart() calls logging.basicConfig() to set the level/format.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def setup_driver(window="1920,5000"):
    """Create a headless Chrome WebDriver configured for zh-CN pages.

    Args:
        window: Value for Chrome's ``--window-size`` flag, as "width,height".

    Returns:
        A ``selenium.webdriver.Chrome`` instance.

    Environment:
        CHROMEDRIVER_PATH: if set, used as the chromedriver executable.
        CHROME_BIN: if set, used as the Chrome binary location.
    """
    user_agent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    )
    chrome_flags = (
        '--headless=new',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        f'--window-size={window}',
        '--lang=zh-CN',
        f'user-agent={user_agent}',
    )

    opts = Options()
    for flag in chrome_flags:
        opts.add_argument(flag)
    opts.add_experimental_option('prefs', {'intl.accept_languages': 'zh-CN,zh,en-US,en'})

    # Explicit driver / browser locations are optional and come from the environment.
    driver_path = os.getenv('CHROMEDRIVER_PATH')
    chrome_service = Service(driver_path) if driver_path else None

    chrome_binary = os.getenv('CHROME_BIN')
    if chrome_binary:
        opts.binary_location = chrome_binary

    return webdriver.Chrome(service=chrome_service, options=opts)
|
||
|
||
|
||
def find_first_chord_chart(driver, search_url):
    """Load the search page and pick the best "functional chart" result.

    yopu search results may list several versions of the same song:
    - letter chart: the ``.nier-snippet`` preview contains SVG ``<text>``
      nodes (chord letters such as G / Em7 / C);
    - functional chart (numbers / scale degrees): the preview contains no
      SVG ``<text>`` nodes.

    The first functional hit (``svgTextCount === 0``) is preferred; if there is
    none, the first hit of any kind is used.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        search_url: Absolute URL of the search page.

    Returns:
        A dict with keys ``url``, ``title``, ``subtitle``, ``text`` and
        ``kind``, or ``None`` when no usable ``a.post-main`` result is found.
    """
    logger.info("loading search: %s", search_url)
    driver.get(search_url)
    # Fixed delay to let the client-side search results render.
    time.sleep(3)

    raw_hits = driver.execute_script("""
        var out = [];
        var posts = document.querySelectorAll('a.post-main');
        for (var i = 0; i < posts.length; i++) {
            var p = posts[i];
            var titleEl = p.querySelector('.title-line .title, .title');
            var subEl = p.querySelector('.title-line .subtitle, .subtitle');
            var info = p.querySelector('.one-line-info');
            var snippet = p.querySelector('.nier-snippet');
            // NOTE(review): a post without any .nier-snippet also gets
            // svgTextCount 0 and is therefore classified as functional.
            var svgTextCount = snippet ? snippet.querySelectorAll('svg text').length : 0;
            out.push({
                href: p.href,
                title: titleEl ? (titleEl.textContent || '').trim() : '',
                subtitle: subEl ? (subEl.textContent || '').trim() : '',
                info: info ? (info.textContent || '').trim() : '',
                isFunctional: svgTextCount === 0,
                svgTextCount: svgTextCount,
            });
        }
        return out;
    """)

    # Drop anchors without an href: resolving an empty href would yield the
    # search URL itself, which must never be returned as the view page.
    hits = [h for h in (raw_hits or []) if h.get('href')]
    if not hits:
        logger.warning("no a.post-main found — yopu DOM changed?")
        return None

    # Prefer functional charts; otherwise fall back to the very first hit.
    functional = [h for h in hits if h['isFunctional']]
    if functional:
        chosen, kind = functional[0], 'functional'
    else:
        chosen, kind = hits[0], 'letter-chord (no functional version found)'

    # urljoin covers absolute, root-relative, relative and protocol-relative hrefs.
    href = urljoin(search_url, chosen['href'])
    title = chosen.get('title') or ''
    subtitle = chosen.get('subtitle') or ''
    info = chosen.get('info') or ''

    logger.info("[%s] %s — %s [%s] (%d total: %d functional, %d letter)",
                kind, title, subtitle, info,
                len(hits), len(functional), len(hits) - len(functional))
    return {
        'url': href,
        'title': title,
        'subtitle': subtitle,
        'text': info,
        'kind': kind,
    }
|
||
|
||
|
||
def select_option_in_row(driver, row_label, button_text, timeout=10):
    """Click the button containing *button_text* inside the row labelled *row_label*.

    This is best effort: a missing row, a missing button or a failed click is
    only logged at debug level.

    Args:
        driver: Selenium WebDriver showing the view page.
        row_label: Text that the row's label element must contain.
        button_text: Text that the button must contain.
        timeout: Upper bound in seconds for locating the row; capped at 3.

    Returns:
        True if a button was clicked, False otherwise.
    """
    row_xpath = (
        f"//div[contains(@class, 'row')][.//div[contains(@class, 'label') "
        f"and contains(normalize-space(.), '{row_label}')]]"
    )

    # Keep the wait short so an absent row does not stall the whole flow.
    short_wait = WebDriverWait(driver, min(timeout, 3))
    try:
        row = short_wait.until(EC.presence_of_element_located((By.XPATH, row_xpath)))
    except TimeoutException:
        logger.debug("row '%s' not present (skipped)", row_label)
        return False

    candidates = row.find_elements(By.CSS_SELECTOR, "button.option, button")
    target = next(
        (b for b in candidates if button_text in (b.text or '').strip()),
        None,
    )
    if target is None:
        logger.debug("button '%s' not found in row '%s'", button_text, row_label)
        return False

    try:
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", target)
        time.sleep(0.3)
        target.click()
        logger.info("clicked '%s' in row '%s'", button_text, row_label)
        # Give the page time to re-render after the click.
        time.sleep(1.2)
    except Exception as exc:
        logger.debug("click failed in row '%s' / '%s': %s", row_label, button_text, exc)
        return False
    return True
|
||
|
||
|
||
def expand_sheet_container(driver, container):
    """Remove height/overflow limits from the sheet container and its ancestors.

    Walks from *container* up to (but not including) ``document.body``. Every
    element whose computed ``overflow`` / ``overflow-y`` clips content
    (``hidden``, ``auto``, ``scroll`` or ``clip``) or that has a ``max-height``
    gets ``overflow: visible; max-height: none; height: auto`` set inline.
    The container's ``min-height`` is then pinned to its ``scrollHeight`` so
    an element screenshot covers the whole sheet.

    The previous inline style of the container is saved in its
    ``data-orig-style`` attribute, and the list of touched elements (with
    their previous inline styles) is stored on ``window.__yopuModified``.

    Args:
        driver: Selenium WebDriver showing the page.
        container: WebElement of the sheet container.

    Returns:
        Whatever the script returns: a dict with ``scrollHeight`` (container
        scroll height after expansion) and ``modified`` (number of elements
        touched during the ancestor walk).
    """
    return driver.execute_script("""
        var c = arguments[0];
        var origStyle = c.getAttribute('style') || '';
        var modified = [];
        // Computed overflow values that clip content to the element's box.
        var CLIPPING = ['hidden', 'auto', 'scroll', 'clip'];
        function clips(value) { return CLIPPING.indexOf(value) !== -1; }

        var node = c;
        while (node && node !== document.body) {
            var cs = window.getComputedStyle(node);
            if (clips(cs.overflow) || clips(cs.overflowY) || cs.maxHeight !== 'none') {
                modified.push({ el: node, orig: node.getAttribute('style') || '' });
                node.style.overflow = 'visible';
                node.style.overflowY = 'visible';
                node.style.maxHeight = 'none';
                node.style.height = 'auto';
            }
            node = node.parentElement;
        }
        c.style.overflow = 'visible';
        c.style.maxHeight = 'none';
        c.style.height = 'auto';
        c.style.minHeight = c.scrollHeight + 'px';
        c.offsetHeight;  // force reflow
        c.setAttribute('data-orig-style', origStyle);
        window.__yopuModified = modified;
        return { scrollHeight: c.scrollHeight, modified: modified.length };
    """, container)
|
||
|
||
|
||
def crop_white(path, pad_top=20, pad_bottom=50, pad_left=20, pad_right=20, white_th=250):
    """Crop the near-white margins of the image at *path* in place, keeping padding.

    A pixel is "white" when all of R, G and B are greater than *white_th*.
    A row or column is a margin when at least 99% of its pixels are white;
    columns are only evaluated between the detected top and bottom rows.
    The cropped image overwrites *path* as PNG.

    Args:
        path: Image file to read and overwrite.
        pad_top, pad_bottom, pad_left, pad_right: Pixels of margin to keep
            around the detected content (clamped to the image bounds).
        white_th: Per-channel threshold above which a pixel counts as white.

    Returns:
        None. Returns early without saving if the detected bounds are empty.
    """
    image = Image.open(path)
    width, height = image.size
    if image.mode != 'RGB':
        image = image.convert('RGB')
    pixels = image.load()

    def is_white(rgb):
        r, g, b = rgb
        return r > white_th and g > white_th and b > white_th

    def row_white_ratio(y):
        return sum(1 for x in range(width) if is_white(pixels[x, y])) / width

    def col_white_ratio(x, y_start, y_end):
        span = max(1, y_end - y_start)
        return sum(1 for y in range(y_start, y_end) if is_white(pixels[x, y])) / span

    # Vertical bounds: first and last row that is less than 99% white.
    top = next((y for y in range(height) if row_white_ratio(y) < 0.99), 0)
    bottom = next(
        (y + 1 for y in reversed(range(height)) if row_white_ratio(y) < 0.99),
        height,
    )
    if top >= bottom:
        return  # all white, give up

    # Horizontal bounds, measured only within [top, bottom).
    left = next(
        (x for x in range(width) if col_white_ratio(x, top, bottom) < 0.99),
        0,
    )
    right = next(
        (x + 1 for x in reversed(range(width)) if col_white_ratio(x, top, bottom) < 0.99),
        width,
    )
    if left >= right:
        return

    box = (
        max(0, left - pad_left),
        max(0, top - pad_top),
        min(width, right + pad_right),
        min(height, bottom + pad_bottom),
    )
    image.crop(box).save(path, 'PNG')
    logger.info("cropped to %s", box)
|
||
|
||
|
||
# Directory where _save_debug() writes the HTML dump and screenshot of a failed run.
DEBUG_DIR = Path('/data/chord-debug')
|
||
|
||
|
||
def _save_debug(driver, tag: str):
    """Dump the current page HTML and a screenshot into DEBUG_DIR for troubleshooting.

    Files are named ``<tag>-<unix timestamp>.html`` / ``.png``. Any failure is
    logged as a warning and otherwise ignored.

    Args:
        driver: Selenium WebDriver whose current page is captured.
        tag: Short label used as the file-name prefix.
    """
    try:
        DEBUG_DIR.mkdir(parents=True, exist_ok=True)
        stamp = int(time.time())
        stem = f'{tag}-{stamp}'

        html_file = DEBUG_DIR / f'{stem}.html'
        html_file.write_text(driver.page_source, encoding='utf-8')

        png_file = DEBUG_DIR / f'{stem}.png'
        driver.save_screenshot(str(png_file))

        logger.info("debug snapshot saved: %s/%s-%d.{html,png}", DEBUG_DIR, tag, stamp)
    except Exception as exc:
        logger.warning("debug snapshot failed: %s", exc)
|
||
|
||
|
||
def fetch_chord_chart(query: str, output_path: str, *,
                      sheet_style: str = '功能谱',
                      chord_style: str = '级数名',
                      verbose: bool = False) -> tuple[bool, str]:
    """Search yopu.co, open the chosen view page, and save a chart screenshot.

    Steps: search, pick a result, open its view page, try to select the sheet
    and chord styles, expand the sheet container, enlarge the font, take an
    element screenshot, and crop its white margins.

    Args:
        query: Search text (URL-quoted into the search URL fragment).
        output_path: Destination PNG path; parent directories are created.
        sheet_style: Button text to select in the "谱面样式" row (best effort).
        chord_style: Button text to select in the "和弦样式" row (best effort).
        verbose: Use DEBUG instead of INFO for ``logging.basicConfig``.

    Returns:
        ``(ok, msg)``. On success ``msg`` is the output path; on failure it is
        an error description. Exceptions are caught, logged and reported
        through the return value rather than raised.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
    )

    search_url = f"https://yopu.co/explore#q={quote(query)}"
    driver = None
    try:
        driver = setup_driver()
        result = find_first_chord_chart(driver, search_url)
        if not result:
            _save_debug(driver, 'no-search-hit')
            return False, '未找到和弦谱'
        view_url = result['url']

        logger.info("loading view: %s", view_url)
        driver.get(view_url)
        time.sleep(3)

        # The old yopu UI offered sheet-style / chord-style rows on the view
        # page. The new UI removed them (switching requires the logged-in
        # app), which is why the search step already prefers the functional
        # version. Still try here, best effort; not finding the rows is fine.
        select_option_in_row(driver, '谱面样式', sheet_style)
        select_option_in_row(driver, '和弦样式', chord_style)

        # Wait for the content to refresh.
        time.sleep(1.5)

        try:
            # The message makes the returned error text meaningful on timeout.
            sheet = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.sheet-container")),
                message="div.sheet-container not found on view page",
            )
        except TimeoutException:
            _save_debug(driver, 'no-sheet-container')
            raise

        driver.execute_script("arguments[0].scrollIntoView(true);", sheet)
        time.sleep(0.5)

        dims = expand_sheet_container(driver, sheet)
        logger.debug("expanded scrollHeight=%s, modified=%s ancestors",
                     dims['scrollHeight'], dims['modified'])
        time.sleep(1.5)

        # incrButton enlarges the font / chord size; click it 3 times as the
        # old version did. Failure here is not fatal.
        try:
            buttons = driver.find_elements(By.CSS_SELECTOR, "button.incrButton")
            if buttons:
                driver.execute_script(
                    "arguments[0].scrollIntoView({block: 'center'});", buttons[0])
                time.sleep(0.3)
                for _ in range(3):
                    buttons[0].click()
                    time.sleep(0.4)
        except Exception as e:
            logger.warning("incrButton failed: %s", e)

        time.sleep(1.0)

        # Scroll the sheet's own content back to the top before capturing it.
        driver.execute_script("arguments[0].scrollTop = 0;", sheet)
        time.sleep(0.4)

        out = Path(output_path)
        out.parent.mkdir(parents=True, exist_ok=True)
        # WebElement.screenshot() returns False when the file could not be
        # written; without this check a stale file left at output_path by an
        # earlier run would be reported as a fresh, successful screenshot.
        saved = sheet.screenshot(str(out))
        if saved is False or not out.exists() or out.stat().st_size < 100:
            return False, '截图为空'
        logger.info("screenshot: %s (%d bytes)", out, out.stat().st_size)

        # Cropping is cosmetic; keep the uncropped screenshot if it fails.
        try:
            crop_white(str(out))
        except Exception as e:
            logger.warning("crop failed: %s", e)

        return True, str(out)
    except Exception as e:
        # Top-level boundary: report every failure through the return value.
        logger.exception("fetch failed: %s", e)
        return False, str(e)
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Nothing useful can be done if shutdown fails; just record it.
                logger.debug("driver.quit() failed", exc_info=True)
|