#!/usr/bin/env python3
"""
Chord-chart scraper for yopu.co (v2).

Compared with the old guitar version, the UI changed: the settings are now
separate rows:
  - "谱面样式" (sheet style)   -> choose "功能谱"
  - "和弦样式" (chord style)   -> choose "级数名"
  - "和弦图"   (chord diagram) -> left at its default

Scrape flow:
  1. Search via /explore#q=
  2. Find the first result whose info line mentions "和弦谱" -> open its /view/ page
  3. In the row whose label is X, click the button.option whose text is Y
  4. Expand the div.sheet-container by stripping overflow / max-height so the
     whole sheet is rendered
  5. Screenshot the whole container element
  6. Crop the white margins with PIL, add padding, save as PNG
"""

import os
import time
import logging
from pathlib import Path
from typing import Optional
from urllib.parse import quote, urlparse, urljoin

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException

from PIL import Image

logger = logging.getLogger(__name__)


def setup_driver(window="1920,5000"):
    """Create a headless Chrome WebDriver configured for zh-CN pages.

    Args:
        window: Value passed to Chrome's ``--window-size`` flag, as
            ``"width,height"``.

    Returns:
        A new ``webdriver.Chrome`` instance. If the ``CHROMEDRIVER_PATH``
        environment variable is set it is used as the chromedriver
        executable; if ``CHROME_BIN`` is set it is used as the browser binary.
    """
    opts = Options()
    for flag in (
        '--headless=new',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        f'--window-size={window}',
        '--lang=zh-CN',
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    ):
        opts.add_argument(flag)
    opts.add_experimental_option('prefs', {'intl.accept_languages': 'zh-CN,zh,en-US,en'})

    service = None
    if driver_path := os.getenv('CHROMEDRIVER_PATH'):
        service = Service(driver_path)
    if chrome_bin := os.getenv('CHROME_BIN'):
        opts.binary_location = chrome_bin
    return webdriver.Chrome(service=service, options=opts)


def find_first_chord_chart(driver, search_url) -> Optional[dict]:
    """Find the first chord-chart result on the search page.

    Loads ``search_url``, then collects every ``a.post-main`` whose
    ``.one-line-info`` text contains both "和弦" and "谱".

    Args:
        driver: Selenium WebDriver used to load the page.
        search_url: Full URL of the search page.

    Returns:
        ``{'url', 'title', 'text'}`` for the first matching result that has a
        link target, or ``None`` when there is no such result.
    """
    logger.info("loading search: %s", search_url)
    driver.get(search_url)
    time.sleep(3)  # results are rendered client-side; give them time to appear

    chord_links = driver.execute_script("""
        var hits = [];
        var posts = document.querySelectorAll('a.post-main');
        for (var i = 0; i < posts.length; i++) {
            var info = posts[i].querySelector('.one-line-info');
            var t = info ? (info.textContent || info.innerText || '') : '';
            if (t.indexOf('和弦') >= 0 && t.indexOf('谱') >= 0) {
                hits.push({
                    href: posts[i].href,
                    title: (posts[i].querySelector('.title-line .title, .title') || {}).textContent || '',
                    text: t.trim(),
                });
            }
        }
        return hits;
    """)

    if not chord_links:
        logger.warning("no '和弦谱' hits in search results")
        return None

    # An anchor without an href would resolve to the search page itself, so
    # skip such hits instead of returning the search URL as the "view" page.
    first = next((hit for hit in chord_links if hit.get('href')), None)
    if first is None:
        logger.warning("'和弦谱' hits had no usable href")
        return None

    href = first['href']
    if not href.startswith('http'):
        # urljoin handles root-relative, relative and protocol-relative
        # ("//host/path") targets alike.
        href = urljoin(search_url, href)

    logger.info("matched: %s — %s", first.get('title'), href)
    return {
        'url': href,
        'title': first.get('title') or '',
        'text': first.get('text') or '',
    }


def _xpath_literal(text: str) -> str:
    """Return ``text`` as an XPath string literal, whatever quotes it contains."""
    if "'" not in text:
        return f"'{text}'"
    if '"' not in text:
        return f'"{text}"'
    # XPath 1.0 has no escape syntax: glue the pieces together with concat().
    pieces = ", \"'\", ".join(f"'{part}'" for part in text.split("'"))
    return f"concat({pieces})"


def select_option_in_row(driver, row_label, button_text, timeout=10):
    """Click the option button containing ``button_text`` in a labelled row.

    The row is the first ``div`` with class ``row`` that has a descendant
    ``div`` with class ``label`` whose text contains ``row_label``.

    Args:
        driver: Selenium WebDriver showing the view page.
        row_label: Text the row's label must contain.
        button_text: Text the button must contain.
        timeout: Seconds to wait for the row to be present.

    Returns:
        True if a button was clicked. False if the row or the button was not
        found, or the click failed; this is logged as a warning rather than
        raised, since the UI wording may simply have changed.
    """
    row_xpath = (
        "//div[contains(@class, 'row')][.//div[contains(@class, 'label') "
        f"and contains(normalize-space(.), {_xpath_literal(row_label)})]]"
    )
    try:
        row = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, row_xpath))
        )
    except TimeoutException:
        logger.warning("row '%s' not found", row_label)
        return False

    buttons = row.find_elements(By.CSS_SELECTOR, "button.option, button")
    for btn in buttons:
        label = (btn.text or '').strip()
        if button_text not in label:
            continue
        try:
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", btn)
            time.sleep(0.3)
            btn.click()
        except Exception as e:
            # Best effort: a failed click is reported to the caller, not raised.
            logger.warning("click failed in row '%s' / '%s': %s", row_label, button_text, e)
            return False
        logger.info("clicked '%s' in row '%s'", button_text, row_label)
        time.sleep(1.2)  # let the sheet re-render with the new option
        return True

    logger.warning("button '%s' not found in row '%s' (had: %s)",
                   button_text, row_label,
                   [(b.text or '').strip() for b in buttons])
    return False


def expand_sheet_container(driver, container):
    """Remove clipping from the sheet container and its ancestors.

    Walks from ``container`` up to (but not including) ``document.body`` and,
    for every element whose computed overflow is hidden/auto or that has a
    max-height, sets overflow visible, max-height none and height auto, so the
    full scroll height is exposed and an element screenshot captures the
    whole sheet. The touched elements and their original inline styles are
    stored on ``window.__yopuModified``; the container's original inline
    style is stored in its ``data-orig-style`` attribute.

    Args:
        driver: Selenium WebDriver showing the view page.
        container: WebElement of the sheet container.

    Returns:
        The script's result: ``{'scrollHeight': ..., 'modified': ...}`` where
        ``modified`` is the number of elements that were changed.
    """
    return driver.execute_script("""
        var c = arguments[0];
        var origStyle = c.getAttribute('style') || '';
        var modified = [];
        var node = c;
        while (node && node !== document.body) {
            var cs = window.getComputedStyle(node);
            if (cs.overflow === 'hidden' || cs.overflow === 'auto' ||
                cs.overflowY === 'hidden' || cs.overflowY === 'auto' ||
                cs.maxHeight !== 'none') {
                modified.push({ el: node, orig: node.getAttribute('style') || '' });
                node.style.overflow = 'visible';
                node.style.overflowY = 'visible';
                node.style.maxHeight = 'none';
                node.style.height = 'auto';
            }
            node = node.parentElement;
        }
        c.style.overflow = 'visible';
        c.style.maxHeight = 'none';
        c.style.height = 'auto';
        c.style.minHeight = c.scrollHeight + 'px';
        c.offsetHeight;  // force reflow
        c.setAttribute('data-orig-style', origStyle);
        window.__yopuModified = modified;
        return { scrollHeight: c.scrollHeight, modified: modified.length };
    """, container)


def crop_white(path, pad_top=20, pad_bottom=50, pad_left=20, pad_right=20, white_th=250):
    """Crop the white margins off an image in place, keeping some padding.

    A row or column counts as margin when at least 99% of its pixels have all
    three RGB channels above ``white_th``. Columns are only measured between
    the detected top and bottom rows. The result overwrites ``path`` as PNG.
    If no row with content is found, the file is left untouched.

    Args:
        path: Path of the image to crop; overwritten with the result.
        pad_top: Pixels of margin to keep above the content.
        pad_bottom: Pixels of margin to keep below the content.
        pad_left: Pixels of margin to keep left of the content.
        pad_right: Pixels of margin to keep right of the content.
        white_th: Channel value above which a pixel counts as white.
    """
    # Load a detached RGB copy so the source file is closed before it is
    # overwritten below.
    with Image.open(path) as src:
        img = src.convert('RGB')
    w, h = img.size
    if w == 0 or h == 0:
        return
    px = img.load()

    def is_white(x, y):
        r, g, b = px[x, y]
        return r > white_th and g > white_th and b > white_th

    def row_has_content(y):
        return sum(is_white(x, y) for x in range(w)) / w < 0.99

    top = next((y for y in range(h) if row_has_content(y)), None)
    if top is None:
        logger.debug("image is entirely white, nothing to crop: %s", path)
        return
    # A content row exists, so the bottom-up scan is guaranteed to find one.
    bottom = next(y for y in range(h - 1, -1, -1) if row_has_content(y)) + 1
    span = bottom - top

    def col_has_content(x):
        return sum(is_white(x, y) for y in range(top, bottom)) / span < 0.99

    # Sparse content can leave every column >= 99% white; keep the full
    # width in that case.
    left = next((x for x in range(w) if col_has_content(x)), 0)
    right = next((x + 1 for x in range(w - 1, -1, -1) if col_has_content(x)), w)

    box = (
        max(0, left - pad_left),
        max(0, top - pad_top),
        min(w, right + pad_right),
        min(h, bottom + pad_bottom),
    )
    img.crop(box).save(path, 'PNG')
    logger.info("cropped to %s", box)


def _enlarge_sheet(driver, clicks=3):
    """Click the first ``button.incrButton`` ``clicks`` times (best effort)."""
    try:
        buttons = driver.find_elements(By.CSS_SELECTOR, "button.incrButton")
        if not buttons:
            return
        incr = buttons[0]
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", incr)
        time.sleep(0.3)
        for _ in range(clicks):
            incr.click()
            time.sleep(0.4)
    except Exception as e:
        # Enlarging is cosmetic; never fail the whole fetch because of it.
        logger.warning("incrButton failed: %s", e)


def fetch_chord_chart(query: str, output_path: str, *,
                      sheet_style: str = '功能谱',
                      chord_style: str = '级数名',
                      verbose: bool = False) -> tuple[bool, str]:
    """Search yopu.co, open the first chord chart, pick styles, screenshot it.

    Args:
        query: Search text.
        output_path: Where to write the PNG; parent directories are created.
        sheet_style: Button text to select in the "谱面样式" row.
        chord_style: Button text to select in the "和弦样式" row.
        verbose: Configure logging at DEBUG instead of INFO.

    Returns:
        ``(ok, msg)``. On success ``msg`` is the output path; on failure it
        describes the error. Exceptions are caught and reported through the
        return value, and the driver is always shut down.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
    )

    search_url = f"https://yopu.co/explore#q={quote(query)}"
    driver = None
    try:
        driver = setup_driver()

        result = find_first_chord_chart(driver, search_url)
        if not result:
            return False, '未找到和弦谱'

        view_url = result['url']
        logger.info("loading view: %s", view_url)
        driver.get(view_url)
        time.sleep(3)

        # Pick the styles (hard-wired MVP combination by default). A missing
        # row or button is only logged; the screenshot is still taken.
        select_option_in_row(driver, '谱面样式', sheet_style)
        select_option_in_row(driver, '和弦样式', chord_style)

        # Wait for the content to refresh.
        time.sleep(1.5)

        sheet = WebDriverWait(driver, 15).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.sheet-container")
        ))
        driver.execute_script("arguments[0].scrollIntoView(true);", sheet)
        time.sleep(0.5)

        dims = expand_sheet_container(driver, sheet)
        logger.debug("expanded scrollHeight=%s, modified=%s ancestors",
                     dims['scrollHeight'], dims['modified'])
        time.sleep(1.5)

        # incrButton: enlarge font / chord size; clicked 3 times like the old
        # version did.
        _enlarge_sheet(driver, clicks=3)
        time.sleep(1.0)

        # Scroll the sheet itself back to the top, then capture the container.
        driver.execute_script("arguments[0].scrollTop = 0;", sheet)
        time.sleep(0.4)

        out = Path(output_path)
        out.parent.mkdir(parents=True, exist_ok=True)
        # screenshot() returns False when the file could not be written;
        # checking it keeps a stale file from an earlier run from being
        # reported as a fresh success.
        saved = sheet.screenshot(str(out))
        if not saved or not out.exists() or out.stat().st_size < 100:
            return False, '截图为空'
        logger.info("screenshot: %s (%d bytes)", out, out.stat().st_size)

        try:
            crop_white(str(out))
        except Exception as e:
            # The uncropped screenshot is still a usable result.
            logger.warning("crop failed: %s", e)

        return True, str(out)

    except Exception as e:
        logger.error("fetch failed: %s", e, exc_info=True)
        return False, str(e)
    finally:
        if driver:
            try:
                driver.quit()
            except Exception as e:
                logger.debug("driver.quit() failed: %s", e)