Compare commits

1 Commits

Author SHA1 Message Date
RainySY
4412df0949 chore: snapshot backup before rainycy push (20260624-032434)
Auto-committed by MiMo for migration to git.rainycy.top
2026-06-24 03:26:19 +08:00

621
main.py
View File

@@ -58,6 +58,7 @@ class LinuxDoPreviewPlugin(Star):
# 登录状态:跨 fetch 复用在同一 StealthySession 中 # 登录状态:跨 fetch 复用在同一 StealthySession 中
self._auth_check_done = False self._auth_check_done = False
self._logged_in = False self._logged_in = False
self._auth_lock = threading.Lock()
async def terminate(self): async def terminate(self):
_EXECUTOR.shutdown(wait=False) _EXECUTOR.shutdown(wait=False)
@@ -203,173 +204,175 @@ class LinuxDoPreviewPlugin(Star):
return None return None
page = ctx.new_page() page = ctx.new_page()
page.set_viewport_size({"width": 1280, "height": 900})
# ── 导航:等 networkidle 确保 JS 动态内容加载完成 ──
page.goto(url, wait_until="networkidle", timeout=timeout_ms)
# ── 等待 Discourse 帖子内容渲染 ──
try: try:
page.wait_for_selector("#post_1", timeout=min(timeout_ms, 10000)) page.set_viewport_size({"width": 1280, "height": 900})
except Exception:
page.wait_for_timeout(3000) # 回退:固定等待
# ── 隐藏非楼主内容,只保留第一篇帖子完整展示 ── # ── 导航:等 networkidle 确保 JS 动态内容加载完成 ──
page.evaluate("""() => { page.goto(url, wait_until="networkidle", timeout=timeout_ms)
const hide = (sel) => {
const el = document.querySelector(sel);
if (el) el.style.display = 'none';
};
hide('.d-header'); // 顶部导航栏
hide('.sidebar-wrapper'); // 左侧边栏
hide('.topic-navigation-wrapper'); // 帖子导航条
hide('.footer-nav.visible'); // 底部导航
hide('.post-stream'); // 隐藏整个帖子流(后面单独显示楼主)
// 隐藏所有回复帖子,只保留楼主 # ── 等待 Discourse 帖子内容渲染 ──
const posts = document.querySelectorAll('.topic-post'); try:
posts.forEach((post, i) => { if (i > 0) post.style.display = 'none'; }); page.wait_for_selector("#post_1", timeout=min(timeout_ms, 10000))
except Exception:
page.wait_for_timeout(3000) # 回退:固定等待
// 滚动到顶部 # ── 隐藏非楼主内容,只保留第一篇帖子完整展示 ──
window.scrollTo(0, 0); page.evaluate("""() => {
}""") const hide = (sel) => {
const el = document.querySelector(sel);
if (el) el.style.display = 'none';
};
hide('.d-header'); // 顶部导航栏
hide('.sidebar-wrapper'); // 左侧边栏
hide('.topic-navigation-wrapper'); // 帖子导航条
hide('.footer-nav.visible'); // 底部导航
hide('.post-stream'); // 隐藏整个帖子流(后面单独显示楼主)
# ── 展开 Discourse 截断的长帖 ── // 隐藏所有回复帖子,只保留楼主
page.evaluate("""() => { const posts = document.querySelectorAll('.topic-post');
// 移除所有展开按钮和截断遮罩 posts.forEach((post, i) => { if (i > 0) post.style.display = 'none'; });
const removeSelectors = [
'.expand-post',
'.gap-bottom',
'.gap',
'.large-post-container .show-more',
'.topic-body .show-more',
'.cooked .show-more',
'.lightbox',
];
removeSelectors.forEach(sel => {
document.querySelectorAll(sel).forEach(el => el.remove());
});
// 移除所有 max-height / overflow 限制 // 滚动到顶部
const unclampSelectors = [ window.scrollTo(0, 0);
'.cooked', }""")
'.topic-body',
'#post_1 .cooked', # ── 展开 Discourse 截断的长帖 ──
'#post_1 .topic-body', page.evaluate("""() => {
'#post_1 .contents', // 移除所有展开按钮和截断遮罩
'.large-post-container', const removeSelectors = [
]; '.expand-post',
unclampSelectors.forEach(sel => { '.gap-bottom',
document.querySelectorAll(sel).forEach(el => { '.gap',
'.large-post-container .show-more',
'.topic-body .show-more',
'.cooked .show-more',
'.lightbox',
];
removeSelectors.forEach(sel => {
document.querySelectorAll(sel).forEach(el => el.remove());
});
// 移除所有 max-height / overflow 限制
const unclampSelectors = [
'.cooked',
'.topic-body',
'#post_1 .cooked',
'#post_1 .topic-body',
'#post_1 .contents',
'.large-post-container',
];
unclampSelectors.forEach(sel => {
document.querySelectorAll(sel).forEach(el => {
el.style.maxHeight = 'none';
el.style.overflow = 'visible';
el.style.height = 'auto';
});
});
// 展开 Discourse 长帖截断data-* 属性方式)
document.querySelectorAll('[data-expanded]').forEach(el => {
el.setAttribute('data-expanded', 'true');
});
// 移除 truncated 标记
document.querySelectorAll('.truncated').forEach(el => {
el.classList.remove('truncated');
});
}""")
# ── 点击可能存在的展开按钮 ──
try:
expand_buttons = page.query_selector_all(
'#post_1 .expand-post, #post_1 .show-more, '
'#post_1 button[class*="expand"], '
'#post_1 a[class*="expand"]'
)
for btn in expand_buttons:
try:
btn.click()
page.wait_for_timeout(300)
except Exception:
pass
except Exception:
pass
# ── 再次展开,防止点击按钮后重新截断 ──
page.evaluate("""() => {
['#post_1 .cooked', '#post_1 .topic-body', '#post_1 .contents'].forEach(sel => {
document.querySelectorAll(sel).forEach(el => {
el.style.maxHeight = 'none';
el.style.overflow = 'visible';
el.style.height = 'auto';
});
});
// 确保图片容器也不截断
document.querySelectorAll('#post_1 .lightbox-wrapper').forEach(el => {
el.style.maxHeight = 'none'; el.style.maxHeight = 'none';
el.style.overflow = 'visible'; el.style.overflow = 'visible';
el.style.height = 'auto';
}); });
}); }""")
// 展开 Discourse 长帖截断data-* 属性方式) # ── 滚动楼主帖子,触发懒加载图片 ──
document.querySelectorAll('[data-expanded]').forEach(el => { post1_box = page.evaluate("""() => {
el.setAttribute('data-expanded', 'true'); const p1 = document.querySelector('#post_1');
}); if (!p1) return null;
// 移除 truncated 标记 const rect = p1.getBoundingClientRect();
document.querySelectorAll('.truncated').forEach(el => { return { top: rect.top + window.scrollY, height: rect.height };
el.classList.remove('truncated'); }""")
}); if post1_box:
}""") post_top = int(post1_box.get('top', 0))
post_height = int(post1_box.get('height', 0))
for y in range(post_top, post_top + post_height, 400):
page.evaluate(f"window.scrollTo(0, {y})")
page.wait_for_timeout(200)
else:
# 回退:滚动整个页面
total_height = page.evaluate("document.body.scrollHeight")
for y in range(0, total_height, 400):
page.evaluate(f"window.scrollTo(0, {y})")
page.wait_for_timeout(200)
# ── 点击可能存在的展开按钮 ── # ── 等待图片加载完成 ──
try: page.evaluate("""() => {
expand_buttons = page.query_selector_all( return new Promise(resolve => {
'#post_1 .expand-post, #post_1 .show-more, ' const imgs = document.querySelectorAll('#post_1 img');
'#post_1 button[class*="expand"], ' let loaded = 0;
'#post_1 a[class*="expand"]' const total = imgs.length;
) if (total === 0) return resolve();
for btn in expand_buttons: imgs.forEach(img => {
try: if (img.complete) {
btn.click()
page.wait_for_timeout(300)
except Exception:
pass
except Exception:
pass
# ── 再次展开,防止点击按钮后重新截断 ──
page.evaluate("""() => {
['#post_1 .cooked', '#post_1 .topic-body', '#post_1 .contents'].forEach(sel => {
document.querySelectorAll(sel).forEach(el => {
el.style.maxHeight = 'none';
el.style.overflow = 'visible';
el.style.height = 'auto';
});
});
// 确保图片容器也不截断
document.querySelectorAll('#post_1 .lightbox-wrapper').forEach(el => {
el.style.maxHeight = 'none';
el.style.overflow = 'visible';
});
}""")
# ── 滚动楼主帖子,触发懒加载图片 ──
post1_box = page.evaluate("""() => {
const p1 = document.querySelector('#post_1');
if (!p1) return null;
const rect = p1.getBoundingClientRect();
return { top: rect.top + window.scrollY, height: rect.height };
}""")
if post1_box:
post_top = int(post1_box.get('top', 0))
post_height = int(post1_box.get('height', 0))
for y in range(post_top, post_top + post_height, 400):
page.evaluate(f"window.scrollTo(0, {y})")
page.wait_for_timeout(200)
else:
# 回退:滚动整个页面
total_height = page.evaluate("document.body.scrollHeight")
for y in range(0, total_height, 400):
page.evaluate(f"window.scrollTo(0, {y})")
page.wait_for_timeout(200)
# ── 等待图片加载完成 ──
page.evaluate("""() => {
return new Promise(resolve => {
const imgs = document.querySelectorAll('#post_1 img');
let loaded = 0;
const total = imgs.length;
if (total === 0) return resolve();
imgs.forEach(img => {
if (img.complete) {
loaded++;
if (loaded >= total) resolve();
} else {
img.onload = img.onerror = () => {
loaded++; loaded++;
if (loaded >= total) resolve(); if (loaded >= total) resolve();
}; } else {
} img.onload = img.onerror = () => {
loaded++;
if (loaded >= total) resolve();
};
}
});
// 最多等 3 秒
setTimeout(resolve, 3000);
}); });
// 最多等 3 秒 }""")
setTimeout(resolve, 3000);
});
}""")
# ── 滚动回顶部 ── # ── 滚动回顶部 ──
page.evaluate("window.scrollTo(0, 0)") page.evaluate("window.scrollTo(0, 0)")
page.wait_for_timeout(500) page.wait_for_timeout(500)
# ── 截图:全页模式,隐藏导航栏后内容干净 ── # ── 截图:全页模式,隐藏导航栏后内容干净 ──
full_page = self.config.get("screenshot_full_page", True) full_page = self.config.get("screenshot_full_page", True)
page.screenshot( page.screenshot(
path=str(save_path), path=str(save_path),
full_page=full_page, full_page=full_page,
timeout=timeout_ms, timeout=timeout_ms,
) )
sz = save_path.stat().st_size sz = save_path.stat().st_size
logger.info( logger.info(
f"[LinuxDoPreview] 截图保存: {save_path.name} ({sz / 1024:.1f} KB)" f"[LinuxDoPreview] 截图保存: {save_path.name} ({sz / 1024:.1f} KB)"
) )
page.close() return save_path
return save_path finally:
page.close()
except Exception as e: except Exception as e:
logger.warning(f"[LinuxDoPreview] 截图失败: {type(e).__name__}: {e}") logger.warning(f"[LinuxDoPreview] 截图失败: {type(e).__name__}: {e}")
@@ -377,54 +380,56 @@ class LinuxDoPreviewPlugin(Star):
# ─────────── 文本提取 ─────────── # ─────────── 文本提取 ───────────
@staticmethod
def _build_json_url(url: str) -> str:
    """Convert a Discourse topic URL into its JSON API URL.

    The URL is reduced to the canonical topic address and ``.json`` is
    appended:

    * ``.../t/<slug>/<id>[/<post_no>]`` -> ``.../t/<slug>/<id>.json``
    * ``.../t/<id>[/<post_no>]``        -> ``.../t/<id>.json``
    * anything without a ``t`` path segment just gets ``.json`` appended.

    Query strings (``?u=name``) and fragments (``#heading``) are dropped
    first; otherwise they would be glued into the last path segment and
    yield an address such as ``.../123?u=name.json``.

    Args:
        url: Topic or post link, with or without a trailing slash.

    Returns:
        The URL of the topic's JSON representation. A URL whose path
        already ends in ``.json`` is returned without the query/fragment
        and trailing slashes, but otherwise unchanged.
    """
    # Only the path identifies the topic; strip "#fragment" and "?query".
    base = url.split('#', 1)[0].split('?', 1)[0].rstrip('/')
    if base.endswith('.json'):
        return base

    parts = base.split('/')
    try:
        t_idx = parts.index('t')
    except ValueError:
        t_idx = -1

    if t_idx >= 0:
        after_t = parts[t_idx + 1] if len(parts) > t_idx + 1 else ''
        if after_t.isascii() and after_t.isdigit():
            # Slug-less link (/t/<id>/<post_no>): the segment right after
            # "t" is taken to be the topic id, so cut just after it.
            base = '/'.join(parts[:t_idx + 2])
        elif len(parts) > t_idx + 2:
            # /t/<slug>/<id>/<post_no> -> keep only /t/<slug>/<id>.
            base = '/'.join(parts[:t_idx + 3])

    return base + '.json'
def _extract_content_from_json(self, session, url: str) -> str: def _extract_content_from_json(self, session, url: str) -> str:
"""通过 Discourse JSON API 获取完整的楼主帖子内容 """通过 Discourse JSON API 获取完整的楼主帖子内容
Discourse 的 .json 端点返回结构化数据,包含完整的 cooked HTML Discourse 的 .json 端点返回结构化数据,包含完整的 cooked HTML
不受页面截断、懒加载或 Cloudflare 渲染问题的影响。 不受页面截断、懒加载或 Cloudflare 渲染问题的影响。
""" """
try: try:
# 构造 JSON URLtopic-url.json 或 topic-url/1.json json_url = self._build_json_url(url)
json_url = url.rstrip('/')
if not json_url.endswith('.json'):
# 对于帖子链接如 /t/topic-slug/12345/5取前两段
parts = json_url.split('/')
# 找到 /t/ 后的部分
t_idx = -1
for i, p in enumerate(parts):
if p == 't':
t_idx = i
break
if t_idx >= 0 and len(parts) > t_idx + 2:
# 重建为 /t/slug/id 格式
json_url = '/'.join(parts[:t_idx + 3])
json_url += '.json'
logger.info(f"[LinuxDoPreview] JSON API 请求: {json_url}") logger.info(f"[LinuxDoPreview] JSON API 请求: {json_url}")
resp = session.fetch(json_url) resp = session.fetch(json_url)
if resp.status != 200: if resp.status != 200:
logger.info(f"[LinuxDoPreview] JSON API 返回 {resp.status}") logger.info(f"[LinuxDoPreview] JSON API 返回 {resp.status}")
return "" return ""
import json import json
data = json.loads(resp.body.decode("utf-8", errors="replace")) data = json.loads(resp.body.decode("utf-8", errors="replace"))
# 从 post_stream 中提取第一个帖子(楼主) # 从 post_stream 中提取第一个帖子(楼主)
post_stream = data.get("post_stream", {}) post_stream = data.get("post_stream", {})
posts = post_stream.get("posts", []) posts = post_stream.get("posts", [])
if not posts: if not posts:
return "" return ""
first_post = posts[0] first_post = posts[0]
cooked_html = first_post.get("cooked", "") cooked_html = first_post.get("cooked", "")
if not cooked_html: if not cooked_html:
return "" return ""
# 使用 lxml 解析 HTML 并提取纯文本 # 使用 lxml 解析 HTML 并提取纯文本
if _lh is not None: if _lh is not None:
tree = _lh.fromstring(cooked_html) tree = _lh.fromstring(cooked_html)
return _clean_text(tree.text_content()) return _clean_text(tree.text_content())
# 回退:正则去标签 # 回退:正则去标签
text = re.sub(r"<[^>]+>", " ", cooked_html) text = re.sub(r"<[^>]+>", " ", cooked_html)
text = re.sub(r"\s+", " ", text).strip() text = re.sub(r"\s+", " ", text).strip()
@@ -567,25 +572,34 @@ class LinuxDoPreviewPlugin(Star):
if self._has_session_cookie(): if self._has_session_cookie():
cookie_value = (self.config.get("linuxdo_session_cookie", "") or "").strip() cookie_value = (self.config.get("linuxdo_session_cookie", "") or "").strip()
if not self._inject_session_cookie(session, cookie_value): if not self._inject_session_cookie(session, cookie_value):
self._auth_check_done = True with self._auth_lock:
self._logged_in = False self._auth_check_done = True
self._logged_in = False
return False return False
# 校验结果只算一次Cookie 有效性跨请求稳定) # 校验结果只算一次Cookie 有效性跨请求稳定)
if not self._auth_check_done: with self._auth_lock:
self._logged_in = self._check_login_state(session) need_check = not self._auth_check_done
self._auth_check_done = True if need_check:
if self._logged_in: logged_in = self._check_login_state(session)
with self._auth_lock:
self._logged_in = logged_in
self._auth_check_done = True
if logged_in:
logger.info("[LinuxDoPreview] Cookie 登录验证成功") logger.info("[LinuxDoPreview] Cookie 登录验证成功")
else: else:
logger.warning( logger.warning(
"[LinuxDoPreview] 会话 Cookie 无效或已过期,将匿名访问。" "[LinuxDoPreview] 会话 Cookie 无效或已过期,将匿名访问。"
"请在浏览器重新获取 Cookie推荐 _t长效后填入配置。" "请在浏览器重新获取 Cookie推荐 _t长效后填入配置。"
) )
return self._logged_in with self._auth_lock:
return self._logged_in
# ── 仅用户名/密码:受 hCaptcha 限制,无法自动登录(仅提示一次) ── # ── 仅用户名/密码:受 hCaptcha 限制,无法自动登录(仅提示一次) ──
if self._has_auto_login() and not self._auth_check_done: with self._auth_lock:
self._auth_check_done = True need_warn = self._has_auto_login() and not self._auth_check_done
if need_warn:
self._auth_check_done = True
if need_warn:
logger.warning( logger.warning(
"[LinuxDoPreview] linux.do 登录启用了 hCaptcha 人机验证,账号密码" "[LinuxDoPreview] linux.do 登录启用了 hCaptcha 人机验证,账号密码"
"自动登录不可用。请在浏览器登录 linux.do 后F12 → Application → " "自动登录不可用。请在浏览器登录 linux.do 后F12 → Application → "
@@ -594,27 +608,18 @@ class LinuxDoPreviewPlugin(Star):
) )
# 都没配置 / 自动登录不可用 → 匿名 # 都没配置 / 自动登录不可用 → 匿名
self._logged_in = False with self._auth_lock:
self._logged_in = False
return False return False
def _fetch_topic_data(self, session, url: str) -> dict | None: def _fetch_topic_data(self, session, url: str) -> dict | None:
"""通过 Discourse JSON API 获取完整的主题数据 """通过 Discourse JSON API 获取完整的主题数据
返回的 dict 包含帖子原始数据cooked HTML、作者、标签、统计等 返回的 dict 包含帖子原始数据cooked HTML、作者、标签、统计等
可同时供文本提取和自定义 HTML 渲染使用。 可同时供文本提取和自定义 HTML 渲染使用。
""" """
try: try:
json_url = url.rstrip('/') json_url = self._build_json_url(url)
if not json_url.endswith('.json'):
parts = json_url.split('/')
t_idx = -1
for i, p in enumerate(parts):
if p == 't':
t_idx = i
break
if t_idx >= 0 and len(parts) > t_idx + 2:
json_url = '/'.join(parts[:t_idx + 3])
json_url += '.json'
logger.info(f"[LinuxDoPreview] 拉取 topic JSON: {json_url}") logger.info(f"[LinuxDoPreview] 拉取 topic JSON: {json_url}")
resp = session.fetch(json_url) resp = session.fetch(json_url)
@@ -749,7 +754,6 @@ class LinuxDoPreviewPlugin(Star):
posts_count = topic_data.get("posts_count", 0) posts_count = topic_data.get("posts_count", 0)
views = topic_data.get("views", 0) views = topic_data.get("views", 0)
like_count = topic_data.get("like_count", 0) like_count = topic_data.get("like_count", 0)
created_at = topic_data.get("created_at", "")
tags = topic_data.get("tags", []) or [] tags = topic_data.get("tags", []) or []
post_stream = topic_data.get("post_stream", {}) or {} post_stream = topic_data.get("post_stream", {}) or {}
@@ -774,12 +778,14 @@ class LinuxDoPreviewPlugin(Star):
if "{size}" in author_avatar: if "{size}" in author_avatar:
author_avatar = author_avatar.replace("{size}", "120") author_avatar = author_avatar.replace("{size}", "120")
post_created = first.get("created_at", "") or "" post_created = first.get("created_at", "") or ""
post_like = first.get("like_count", 0)
cooked_html = first.get("cooked", "") or "" cooked_html = first.get("cooked", "") or ""
# 把 Discourse 相对资源 URL 补全为绝对 URL # 把 Discourse 相对资源 URL 补全为绝对 URL
cooked_html = self._normalize_cooked_urls(cooked_html) cooked_html = self._normalize_cooked_urls(cooked_html)
# 消毒 HTML移除危险标签和事件属性
cooked_html = self._sanitize_html(cooked_html)
# 发布时间格式化 # 发布时间格式化
created_text = "" created_text = ""
if post_created: if post_created:
@@ -981,6 +987,27 @@ class LinuxDoPreviewPlugin(Star):
return f"{n/1000:.1f}k" return f"{n/1000:.1f}k"
return str(n) return str(n)
@staticmethod
def _sanitize_html(html_str: str) -> str:
    """Strip active content from an HTML fragment with regular expressions.

    Removes ``<script>``, ``<iframe>``, ``<object>``, ``<embed>``,
    ``<applet>`` and form-control elements (``form``, ``input``,
    ``textarea``, ``button``, ``select``), deletes ``on*=`` event-handler
    attributes, and defuses ``href`` values using the ``javascript:``
    scheme.

    Args:
        html_str: HTML fragment to clean; may be empty or ``None``.

    Returns:
        The cleaned fragment, or ``""`` for empty input.

    NOTE(review): this is a best-effort regex filter, not a real HTML
    sanitizer. The event-handler patterns run over the whole string, so
    visible text shaped like `` onFoo = bar`` is deleted as well; a
    parser-based sanitizer would be needed to avoid that.
    """
    if not html_str:
        return ""

    blocked = r'script|iframe|object|embed|applet|form|input|textarea|button|select'
    # Complete element, opening tag through matching closing tag.
    paired_re = re.compile(
        rf'<\s*({blocked})\b[^>]*>.*?<\s*/\s*\1\s*>',
        flags=re.IGNORECASE | re.DOTALL,
    )
    # Leftover opening, self-closing or orphan closing tag.
    lone_re = re.compile(
        rf'<\s*/?\s*(?:{blocked})\b[^>]*>',
        flags=re.IGNORECASE,
    )

    # Repeat until nothing changes: deleting a tag can splice the text
    # around it into a new blocked tag ("<ifr<iframe>ame>" -> "<iframe>"),
    # which a single pass would leave in the output.
    while True:
        stripped = lone_re.sub('', paired_re.sub('', html_str))
        if stripped == html_str:
            break
        html_str = stripped

    # Event handlers: double-quoted, single-quoted, then unquoted values.
    html_str = re.sub(r'\s+on\w+\s*=\s*"[^"]*"', '', html_str, flags=re.IGNORECASE)
    html_str = re.sub(r"\s+on\w+\s*=\s*'[^']*'", '', html_str, flags=re.IGNORECASE)
    html_str = re.sub(r'\s+on\w+\s*=\s*[^\s>]+', '', html_str, flags=re.IGNORECASE)

    # Swap only the scheme for "#" and keep the author's own quoting;
    # forcing a double quote here would leave href='...' (or an unquoted
    # value) with mismatched quotes and corrupt the rest of the tag.
    return re.sub(
        r'(href\s*=\s*["\']?\s*)javascript:', r'\1#', html_str, flags=re.IGNORECASE
    )
@staticmethod @staticmethod
def _normalize_cooked_urls(cooked_html: str) -> str: def _normalize_cooked_urls(cooked_html: str) -> str:
"""将 cooked 中的相对资源 URL 转绝对 URL剥离轻臾框包裹与 meta 信息 """将 cooked 中的相对资源 URL 转绝对 URL剥离轻臾框包裹与 meta 信息
@@ -1000,73 +1027,63 @@ class LinuxDoPreviewPlugin(Star):
if not cooked_html: if not cooked_html:
return "" return ""
try: try:
import re as _re
# 1) 绝对化 src/href相对与协议无关 URL # 1) 绝对化 src/href相对与协议无关 URL
cooked_html = _re.sub( cooked_html = re.sub(
r'(src|href)="(//[^"]+)"', r'(src|href)="(//[^"]+)"',
r'\1="https:\2', r'\1="https:\2',
cooked_html, cooked_html,
) )
cooked_html = _re.sub( cooked_html = re.sub(
r'(src|href)="(/uploads/[^"]+)"', r'(src|href)="(/uploads/[^"]+)"',
r'\1="https://linux.do\2', r'\1="https://linux.do\2',
cooked_html, cooked_html,
) )
# 2) 整块剥离 lightbox-wrapper仅保留内部 <img>,丢弃其余 # 2) 使用 lxml 处理嵌套 div 结构lightbox-wrapper、meta 等)
def _pick_imgs(block: str) -> str: if _lh is not None:
imgs = _re.findall(r'<img\b[^>]*>', block, flags=_re.IGNORECASE) try:
return "".join(imgs) tree = _lh.fromstring(f"<root>{cooked_html}</root>")
cooked_html = _re.sub( # lightbox-wrapper只保留内部 <img>,丢弃其余
r'<div[^>]*class="[^"]*lightbox-wrapper[^"]*"[^>]*>(.*?)</div>', for wrapper in tree.cssselect(".lightbox-wrapper"):
lambda m: _pick_imgs(m.group(1)), imgs = wrapper.cssselect("img")
cooked_html, parent = wrapper.getparent()
flags=_re.DOTALL, if parent is None:
) continue
idx = list(parent).index(wrapper)
for i, img in enumerate(imgs):
parent.insert(idx + i, img)
parent.remove(wrapper)
# 3) 退路:直接裸的 <a class="lightbox"> 包裹,剥 a、保留 img # 裸 <a class="lightbox">剥 a、保留子元素
cooked_html = _re.sub( for a in tree.cssselect("a.lightbox"):
parent = a.getparent()
if parent is None:
continue
idx = list(parent).index(a)
for i, child in enumerate(list(a)):
parent.insert(idx + i, child)
parent.remove(a)
# 删除 meta 信息块、文件名、代码块工具栏、download 按钮
for sel in [".meta", ".filename", ".codeblock-buttons",
".pre-actions", "a.download"]:
for el in tree.cssselect(sel):
el.getparent().remove(el)
cooked_html = "".join(
str(_lh.tostring(child, encoding="unicode"))
for child in tree
)
except Exception:
pass
# 3) 正则回退:处理 lxml 未覆盖的残留
cooked_html = re.sub(
r'<a [^>]*class="[^"]*\blightbox\b[^"]*"[^>]*>(.*?)</a>', r'<a [^>]*class="[^"]*\blightbox\b[^"]*"[^>]*>(.*?)</a>',
r'\1', r'\1',
cooked_html, cooked_html,
flags=_re.DOTALL, flags=re.DOTALL,
)
# 4) 删除所有残留的 meta 信息块(文件尺寸、文件名、下载按钮等)
cooked_html = _re.sub(
r'<div[^>]*class="[^"]*\bmeta\b[^"]*"[^>]*>.*?</div>',
'',
cooked_html,
flags=_re.DOTALL,
)
cooked_html = _re.sub(
r'<span[^>]*class="[^"]*\bfilename\b[^"]*"[^>]*>.*?</span>',
'',
cooked_html,
flags=_re.DOTALL,
)
# 5) 删除代码块顶部的工具栏copy/undo 按钮)防止占位
cooked_html = _re.sub(
r'<div[^>]*class="[^"]*\bcodeblock-buttons\b[^"]*"[^>]*>.*?</div>',
'',
cooked_html,
flags=_re.DOTALL,
)
cooked_html = _re.sub(
r'<pre[^>]*>\s*<div[^>]*class="[^"]*\bpre-actions\b[^"]*"[^>]*>.*?</div>',
'<pre>',
cooked_html,
flags=_re.DOTALL,
)
# 6) 删除 download 按钮、悬浮提示等装饰
cooked_html = _re.sub(
r'<a[^>]*class="[^"]*\bdownload[^"]*"[^>]*>.*?</a>',
'',
cooked_html,
flags=_re.DOTALL,
) )
except Exception: except Exception:
@@ -1087,66 +1104,68 @@ class LinuxDoPreviewPlugin(Star):
if not ctx: if not ctx:
return None return None
page = ctx.new_page() page = ctx.new_page()
page.set_viewport_size({"width": 820, "height": 1200})
# 设置内容,等待图片资源加载
page.set_content(html, wait_until="domcontentloaded", timeout=timeout_ms)
# 主动等所有 <img> 加载完成(最多 3s并剔除加载失败的图
page.evaluate("""() => new Promise(resolve => {
const imgs = document.querySelectorAll('img');
if (!imgs.length) return resolve();
let done = 0;
const tick = (img) => {
done++;
// 图加载失败:移除 <img> 避免占位巨大空白
if (img.complete && img.naturalWidth === 0) {
img.remove();
}
if (done >= imgs.length) resolve();
};
imgs.forEach(img => {
if (img.complete) tick(img);
else {
img.addEventListener('load', () => tick(img), { once: true });
img.addEventListener('error', () => tick(img), { once: true });
}
});
setTimeout(resolve, 3000);
})""")
page.wait_for_timeout(300)
# ── 自适应截图:总是优先对 .card 元素截图,按内容实际边界拍 ──
# 元素截图零空白、零截断,不受 viewport 高度限制。
# `screenshot_full_page` 仅作为后备回退:元素截图失败时才使用。
card_locator = page.locator(".card")
full_page = self.config.get("screenshot_full_page", True)
try: try:
if card_locator.count() > 0: page.set_viewport_size({"width": 820, "height": 1200})
card_locator.first.screenshot(
path=str(save_path), # 设置内容,等待图片资源加载
timeout=timeout_ms, page.set_content(html, wait_until="domcontentloaded", timeout=timeout_ms)
)
else: # 主动等所有 <img> 加载完成(最多 3s并剔除加载失败的图
page.evaluate("""() => new Promise(resolve => {
const imgs = document.querySelectorAll('img');
if (!imgs.length) return resolve();
let done = 0;
const tick = (img) => {
done++;
// 图加载失败:移除 <img> 避免占位巨大空白
if (img.complete && img.naturalWidth === 0) {
img.remove();
}
if (done >= imgs.length) resolve();
};
imgs.forEach(img => {
if (img.complete) tick(img);
else {
img.addEventListener('load', () => tick(img), { once: true });
img.addEventListener('error', () => tick(img), { once: true });
}
});
setTimeout(resolve, 3000);
})""")
page.wait_for_timeout(300)
# ── 自适应截图:总是优先对 .card 元素截图,按内容实际边界拍 ──
# 元素截图零空白、零截断,不受 viewport 高度限制。
# `screenshot_full_page` 仅作为后备回退:元素截图失败时才使用。
card_locator = page.locator(".card")
full_page = self.config.get("screenshot_full_page", True)
try:
if card_locator.count() > 0:
card_locator.first.screenshot(
path=str(save_path),
timeout=timeout_ms,
)
else:
page.screenshot(
path=str(save_path),
full_page=full_page,
timeout=timeout_ms,
)
except Exception:
# 回退:若元素截图失败(少见),退到全页截图
page.screenshot( page.screenshot(
path=str(save_path), path=str(save_path),
full_page=full_page, full_page=full_page,
timeout=timeout_ms, timeout=timeout_ms,
) )
except Exception: sz = save_path.stat().st_size
# 回退:若元素截图失败(少见),退到全页截图 logger.info(
page.screenshot( f"[LinuxDoPreview] 渲染截图: {save_path.name} ({sz / 1024:.1f} KB)"
path=str(save_path),
full_page=full_page,
timeout=timeout_ms,
) )
sz = save_path.stat().st_size return save_path
logger.info( finally:
f"[LinuxDoPreview] 渲染截图: {save_path.name} ({sz / 1024:.1f} KB)" page.close()
)
page.close()
return save_path
except Exception as e: except Exception as e:
logger.warning(f"[LinuxDoPreview] HTML 渲染失败: {type(e).__name__}: {e}") logger.warning(f"[LinuxDoPreview] HTML 渲染失败: {type(e).__name__}: {e}")
return None return None