Compare commits
1 Commits
main
...
rainycy-sn
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4412df0949 |
621
main.py
621
main.py
@@ -58,6 +58,7 @@ class LinuxDoPreviewPlugin(Star):
|
|||||||
# 登录状态:跨 fetch 复用在同一 StealthySession 中
|
# 登录状态:跨 fetch 复用在同一 StealthySession 中
|
||||||
self._auth_check_done = False
|
self._auth_check_done = False
|
||||||
self._logged_in = False
|
self._logged_in = False
|
||||||
|
self._auth_lock = threading.Lock()
|
||||||
|
|
||||||
async def terminate(self):
|
async def terminate(self):
|
||||||
_EXECUTOR.shutdown(wait=False)
|
_EXECUTOR.shutdown(wait=False)
|
||||||
@@ -203,173 +204,175 @@ class LinuxDoPreviewPlugin(Star):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
page = ctx.new_page()
|
page = ctx.new_page()
|
||||||
page.set_viewport_size({"width": 1280, "height": 900})
|
|
||||||
|
|
||||||
# ── 导航:等 networkidle 确保 JS 动态内容加载完成 ──
|
|
||||||
page.goto(url, wait_until="networkidle", timeout=timeout_ms)
|
|
||||||
|
|
||||||
# ── 等待 Discourse 帖子内容渲染 ──
|
|
||||||
try:
|
try:
|
||||||
page.wait_for_selector("#post_1", timeout=min(timeout_ms, 10000))
|
page.set_viewport_size({"width": 1280, "height": 900})
|
||||||
except Exception:
|
|
||||||
page.wait_for_timeout(3000) # 回退:固定等待
|
|
||||||
|
|
||||||
# ── 隐藏非楼主内容,只保留第一篇帖子完整展示 ──
|
# ── 导航:等 networkidle 确保 JS 动态内容加载完成 ──
|
||||||
page.evaluate("""() => {
|
page.goto(url, wait_until="networkidle", timeout=timeout_ms)
|
||||||
const hide = (sel) => {
|
|
||||||
const el = document.querySelector(sel);
|
|
||||||
if (el) el.style.display = 'none';
|
|
||||||
};
|
|
||||||
hide('.d-header'); // 顶部导航栏
|
|
||||||
hide('.sidebar-wrapper'); // 左侧边栏
|
|
||||||
hide('.topic-navigation-wrapper'); // 帖子导航条
|
|
||||||
hide('.footer-nav.visible'); // 底部导航
|
|
||||||
hide('.post-stream'); // 隐藏整个帖子流(后面单独显示楼主)
|
|
||||||
|
|
||||||
// 隐藏所有回复帖子,只保留楼主
|
# ── 等待 Discourse 帖子内容渲染 ──
|
||||||
const posts = document.querySelectorAll('.topic-post');
|
try:
|
||||||
posts.forEach((post, i) => { if (i > 0) post.style.display = 'none'; });
|
page.wait_for_selector("#post_1", timeout=min(timeout_ms, 10000))
|
||||||
|
except Exception:
|
||||||
|
page.wait_for_timeout(3000) # 回退:固定等待
|
||||||
|
|
||||||
// 滚动到顶部
|
# ── 隐藏非楼主内容,只保留第一篇帖子完整展示 ──
|
||||||
window.scrollTo(0, 0);
|
page.evaluate("""() => {
|
||||||
}""")
|
const hide = (sel) => {
|
||||||
|
const el = document.querySelector(sel);
|
||||||
|
if (el) el.style.display = 'none';
|
||||||
|
};
|
||||||
|
hide('.d-header'); // 顶部导航栏
|
||||||
|
hide('.sidebar-wrapper'); // 左侧边栏
|
||||||
|
hide('.topic-navigation-wrapper'); // 帖子导航条
|
||||||
|
hide('.footer-nav.visible'); // 底部导航
|
||||||
|
hide('.post-stream'); // 隐藏整个帖子流(后面单独显示楼主)
|
||||||
|
|
||||||
# ── 展开 Discourse 截断的长帖 ──
|
// 隐藏所有回复帖子,只保留楼主
|
||||||
page.evaluate("""() => {
|
const posts = document.querySelectorAll('.topic-post');
|
||||||
// 移除所有展开按钮和截断遮罩
|
posts.forEach((post, i) => { if (i > 0) post.style.display = 'none'; });
|
||||||
const removeSelectors = [
|
|
||||||
'.expand-post',
|
|
||||||
'.gap-bottom',
|
|
||||||
'.gap',
|
|
||||||
'.large-post-container .show-more',
|
|
||||||
'.topic-body .show-more',
|
|
||||||
'.cooked .show-more',
|
|
||||||
'.lightbox',
|
|
||||||
];
|
|
||||||
removeSelectors.forEach(sel => {
|
|
||||||
document.querySelectorAll(sel).forEach(el => el.remove());
|
|
||||||
});
|
|
||||||
|
|
||||||
// 移除所有 max-height / overflow 限制
|
// 滚动到顶部
|
||||||
const unclampSelectors = [
|
window.scrollTo(0, 0);
|
||||||
'.cooked',
|
}""")
|
||||||
'.topic-body',
|
|
||||||
'#post_1 .cooked',
|
# ── 展开 Discourse 截断的长帖 ──
|
||||||
'#post_1 .topic-body',
|
page.evaluate("""() => {
|
||||||
'#post_1 .contents',
|
// 移除所有展开按钮和截断遮罩
|
||||||
'.large-post-container',
|
const removeSelectors = [
|
||||||
];
|
'.expand-post',
|
||||||
unclampSelectors.forEach(sel => {
|
'.gap-bottom',
|
||||||
document.querySelectorAll(sel).forEach(el => {
|
'.gap',
|
||||||
|
'.large-post-container .show-more',
|
||||||
|
'.topic-body .show-more',
|
||||||
|
'.cooked .show-more',
|
||||||
|
'.lightbox',
|
||||||
|
];
|
||||||
|
removeSelectors.forEach(sel => {
|
||||||
|
document.querySelectorAll(sel).forEach(el => el.remove());
|
||||||
|
});
|
||||||
|
|
||||||
|
// 移除所有 max-height / overflow 限制
|
||||||
|
const unclampSelectors = [
|
||||||
|
'.cooked',
|
||||||
|
'.topic-body',
|
||||||
|
'#post_1 .cooked',
|
||||||
|
'#post_1 .topic-body',
|
||||||
|
'#post_1 .contents',
|
||||||
|
'.large-post-container',
|
||||||
|
];
|
||||||
|
unclampSelectors.forEach(sel => {
|
||||||
|
document.querySelectorAll(sel).forEach(el => {
|
||||||
|
el.style.maxHeight = 'none';
|
||||||
|
el.style.overflow = 'visible';
|
||||||
|
el.style.height = 'auto';
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// 展开 Discourse 长帖截断(data-* 属性方式)
|
||||||
|
document.querySelectorAll('[data-expanded]').forEach(el => {
|
||||||
|
el.setAttribute('data-expanded', 'true');
|
||||||
|
});
|
||||||
|
// 移除 truncated 标记
|
||||||
|
document.querySelectorAll('.truncated').forEach(el => {
|
||||||
|
el.classList.remove('truncated');
|
||||||
|
});
|
||||||
|
}""")
|
||||||
|
|
||||||
|
# ── 点击可能存在的展开按钮 ──
|
||||||
|
try:
|
||||||
|
expand_buttons = page.query_selector_all(
|
||||||
|
'#post_1 .expand-post, #post_1 .show-more, '
|
||||||
|
'#post_1 button[class*="expand"], '
|
||||||
|
'#post_1 a[class*="expand"]'
|
||||||
|
)
|
||||||
|
for btn in expand_buttons:
|
||||||
|
try:
|
||||||
|
btn.click()
|
||||||
|
page.wait_for_timeout(300)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# ── 再次展开,防止点击按钮后重新截断 ──
|
||||||
|
page.evaluate("""() => {
|
||||||
|
['#post_1 .cooked', '#post_1 .topic-body', '#post_1 .contents'].forEach(sel => {
|
||||||
|
document.querySelectorAll(sel).forEach(el => {
|
||||||
|
el.style.maxHeight = 'none';
|
||||||
|
el.style.overflow = 'visible';
|
||||||
|
el.style.height = 'auto';
|
||||||
|
});
|
||||||
|
});
|
||||||
|
// 确保图片容器也不截断
|
||||||
|
document.querySelectorAll('#post_1 .lightbox-wrapper').forEach(el => {
|
||||||
el.style.maxHeight = 'none';
|
el.style.maxHeight = 'none';
|
||||||
el.style.overflow = 'visible';
|
el.style.overflow = 'visible';
|
||||||
el.style.height = 'auto';
|
|
||||||
});
|
});
|
||||||
});
|
}""")
|
||||||
|
|
||||||
// 展开 Discourse 长帖截断(data-* 属性方式)
|
# ── 滚动楼主帖子,触发懒加载图片 ──
|
||||||
document.querySelectorAll('[data-expanded]').forEach(el => {
|
post1_box = page.evaluate("""() => {
|
||||||
el.setAttribute('data-expanded', 'true');
|
const p1 = document.querySelector('#post_1');
|
||||||
});
|
if (!p1) return null;
|
||||||
// 移除 truncated 标记
|
const rect = p1.getBoundingClientRect();
|
||||||
document.querySelectorAll('.truncated').forEach(el => {
|
return { top: rect.top + window.scrollY, height: rect.height };
|
||||||
el.classList.remove('truncated');
|
}""")
|
||||||
});
|
if post1_box:
|
||||||
}""")
|
post_top = int(post1_box.get('top', 0))
|
||||||
|
post_height = int(post1_box.get('height', 0))
|
||||||
|
for y in range(post_top, post_top + post_height, 400):
|
||||||
|
page.evaluate(f"window.scrollTo(0, {y})")
|
||||||
|
page.wait_for_timeout(200)
|
||||||
|
else:
|
||||||
|
# 回退:滚动整个页面
|
||||||
|
total_height = page.evaluate("document.body.scrollHeight")
|
||||||
|
for y in range(0, total_height, 400):
|
||||||
|
page.evaluate(f"window.scrollTo(0, {y})")
|
||||||
|
page.wait_for_timeout(200)
|
||||||
|
|
||||||
# ── 点击可能存在的展开按钮 ──
|
# ── 等待图片加载完成 ──
|
||||||
try:
|
page.evaluate("""() => {
|
||||||
expand_buttons = page.query_selector_all(
|
return new Promise(resolve => {
|
||||||
'#post_1 .expand-post, #post_1 .show-more, '
|
const imgs = document.querySelectorAll('#post_1 img');
|
||||||
'#post_1 button[class*="expand"], '
|
let loaded = 0;
|
||||||
'#post_1 a[class*="expand"]'
|
const total = imgs.length;
|
||||||
)
|
if (total === 0) return resolve();
|
||||||
for btn in expand_buttons:
|
imgs.forEach(img => {
|
||||||
try:
|
if (img.complete) {
|
||||||
btn.click()
|
|
||||||
page.wait_for_timeout(300)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# ── 再次展开,防止点击按钮后重新截断 ──
|
|
||||||
page.evaluate("""() => {
|
|
||||||
['#post_1 .cooked', '#post_1 .topic-body', '#post_1 .contents'].forEach(sel => {
|
|
||||||
document.querySelectorAll(sel).forEach(el => {
|
|
||||||
el.style.maxHeight = 'none';
|
|
||||||
el.style.overflow = 'visible';
|
|
||||||
el.style.height = 'auto';
|
|
||||||
});
|
|
||||||
});
|
|
||||||
// 确保图片容器也不截断
|
|
||||||
document.querySelectorAll('#post_1 .lightbox-wrapper').forEach(el => {
|
|
||||||
el.style.maxHeight = 'none';
|
|
||||||
el.style.overflow = 'visible';
|
|
||||||
});
|
|
||||||
}""")
|
|
||||||
|
|
||||||
# ── 滚动楼主帖子,触发懒加载图片 ──
|
|
||||||
post1_box = page.evaluate("""() => {
|
|
||||||
const p1 = document.querySelector('#post_1');
|
|
||||||
if (!p1) return null;
|
|
||||||
const rect = p1.getBoundingClientRect();
|
|
||||||
return { top: rect.top + window.scrollY, height: rect.height };
|
|
||||||
}""")
|
|
||||||
if post1_box:
|
|
||||||
post_top = int(post1_box.get('top', 0))
|
|
||||||
post_height = int(post1_box.get('height', 0))
|
|
||||||
for y in range(post_top, post_top + post_height, 400):
|
|
||||||
page.evaluate(f"window.scrollTo(0, {y})")
|
|
||||||
page.wait_for_timeout(200)
|
|
||||||
else:
|
|
||||||
# 回退:滚动整个页面
|
|
||||||
total_height = page.evaluate("document.body.scrollHeight")
|
|
||||||
for y in range(0, total_height, 400):
|
|
||||||
page.evaluate(f"window.scrollTo(0, {y})")
|
|
||||||
page.wait_for_timeout(200)
|
|
||||||
|
|
||||||
# ── 等待图片加载完成 ──
|
|
||||||
page.evaluate("""() => {
|
|
||||||
return new Promise(resolve => {
|
|
||||||
const imgs = document.querySelectorAll('#post_1 img');
|
|
||||||
let loaded = 0;
|
|
||||||
const total = imgs.length;
|
|
||||||
if (total === 0) return resolve();
|
|
||||||
imgs.forEach(img => {
|
|
||||||
if (img.complete) {
|
|
||||||
loaded++;
|
|
||||||
if (loaded >= total) resolve();
|
|
||||||
} else {
|
|
||||||
img.onload = img.onerror = () => {
|
|
||||||
loaded++;
|
loaded++;
|
||||||
if (loaded >= total) resolve();
|
if (loaded >= total) resolve();
|
||||||
};
|
} else {
|
||||||
}
|
img.onload = img.onerror = () => {
|
||||||
|
loaded++;
|
||||||
|
if (loaded >= total) resolve();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
});
|
||||||
|
// 最多等 3 秒
|
||||||
|
setTimeout(resolve, 3000);
|
||||||
});
|
});
|
||||||
// 最多等 3 秒
|
}""")
|
||||||
setTimeout(resolve, 3000);
|
|
||||||
});
|
|
||||||
}""")
|
|
||||||
|
|
||||||
# ── 滚动回顶部 ──
|
# ── 滚动回顶部 ──
|
||||||
page.evaluate("window.scrollTo(0, 0)")
|
page.evaluate("window.scrollTo(0, 0)")
|
||||||
page.wait_for_timeout(500)
|
page.wait_for_timeout(500)
|
||||||
|
|
||||||
# ── 截图:全页模式,隐藏导航栏后内容干净 ──
|
# ── 截图:全页模式,隐藏导航栏后内容干净 ──
|
||||||
full_page = self.config.get("screenshot_full_page", True)
|
full_page = self.config.get("screenshot_full_page", True)
|
||||||
page.screenshot(
|
page.screenshot(
|
||||||
path=str(save_path),
|
path=str(save_path),
|
||||||
full_page=full_page,
|
full_page=full_page,
|
||||||
timeout=timeout_ms,
|
timeout=timeout_ms,
|
||||||
)
|
)
|
||||||
|
|
||||||
sz = save_path.stat().st_size
|
sz = save_path.stat().st_size
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[LinuxDoPreview] 截图保存: {save_path.name} ({sz / 1024:.1f} KB)"
|
f"[LinuxDoPreview] 截图保存: {save_path.name} ({sz / 1024:.1f} KB)"
|
||||||
)
|
)
|
||||||
page.close()
|
return save_path
|
||||||
return save_path
|
finally:
|
||||||
|
page.close()
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"[LinuxDoPreview] 截图失败: {type(e).__name__}: {e}")
|
logger.warning(f"[LinuxDoPreview] 截图失败: {type(e).__name__}: {e}")
|
||||||
@@ -377,54 +380,56 @@ class LinuxDoPreviewPlugin(Star):
|
|||||||
|
|
||||||
# ─────────── 文本提取 ───────────
|
# ─────────── 文本提取 ───────────
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _build_json_url(url: str) -> str:
|
||||||
|
"""将 topic URL 转换为对应的 JSON API URL"""
|
||||||
|
json_url = url.rstrip('/')
|
||||||
|
if not json_url.endswith('.json'):
|
||||||
|
parts = json_url.split('/')
|
||||||
|
t_idx = -1
|
||||||
|
for i, p in enumerate(parts):
|
||||||
|
if p == 't':
|
||||||
|
t_idx = i
|
||||||
|
break
|
||||||
|
if t_idx >= 0 and len(parts) > t_idx + 2:
|
||||||
|
json_url = '/'.join(parts[:t_idx + 3])
|
||||||
|
json_url += '.json'
|
||||||
|
return json_url
|
||||||
|
|
||||||
def _extract_content_from_json(self, session, url: str) -> str:
|
def _extract_content_from_json(self, session, url: str) -> str:
|
||||||
"""通过 Discourse JSON API 获取完整的楼主帖子内容
|
"""通过 Discourse JSON API 获取完整的楼主帖子内容
|
||||||
|
|
||||||
Discourse 的 .json 端点返回结构化数据,包含完整的 cooked HTML,
|
Discourse 的 .json 端点返回结构化数据,包含完整的 cooked HTML,
|
||||||
不受页面截断、懒加载或 Cloudflare 渲染问题的影响。
|
不受页面截断、懒加载或 Cloudflare 渲染问题的影响。
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# 构造 JSON URL:topic-url.json 或 topic-url/1.json
|
json_url = self._build_json_url(url)
|
||||||
json_url = url.rstrip('/')
|
|
||||||
if not json_url.endswith('.json'):
|
|
||||||
# 对于帖子链接如 /t/topic-slug/12345/5,取前两段
|
|
||||||
parts = json_url.split('/')
|
|
||||||
# 找到 /t/ 后的部分
|
|
||||||
t_idx = -1
|
|
||||||
for i, p in enumerate(parts):
|
|
||||||
if p == 't':
|
|
||||||
t_idx = i
|
|
||||||
break
|
|
||||||
if t_idx >= 0 and len(parts) > t_idx + 2:
|
|
||||||
# 重建为 /t/slug/id 格式
|
|
||||||
json_url = '/'.join(parts[:t_idx + 3])
|
|
||||||
json_url += '.json'
|
|
||||||
|
|
||||||
logger.info(f"[LinuxDoPreview] JSON API 请求: {json_url}")
|
logger.info(f"[LinuxDoPreview] JSON API 请求: {json_url}")
|
||||||
resp = session.fetch(json_url)
|
resp = session.fetch(json_url)
|
||||||
if resp.status != 200:
|
if resp.status != 200:
|
||||||
logger.info(f"[LinuxDoPreview] JSON API 返回 {resp.status}")
|
logger.info(f"[LinuxDoPreview] JSON API 返回 {resp.status}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
data = json.loads(resp.body.decode("utf-8", errors="replace"))
|
data = json.loads(resp.body.decode("utf-8", errors="replace"))
|
||||||
|
|
||||||
# 从 post_stream 中提取第一个帖子(楼主)
|
# 从 post_stream 中提取第一个帖子(楼主)
|
||||||
post_stream = data.get("post_stream", {})
|
post_stream = data.get("post_stream", {})
|
||||||
posts = post_stream.get("posts", [])
|
posts = post_stream.get("posts", [])
|
||||||
if not posts:
|
if not posts:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
first_post = posts[0]
|
first_post = posts[0]
|
||||||
cooked_html = first_post.get("cooked", "")
|
cooked_html = first_post.get("cooked", "")
|
||||||
if not cooked_html:
|
if not cooked_html:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
# 使用 lxml 解析 HTML 并提取纯文本
|
# 使用 lxml 解析 HTML 并提取纯文本
|
||||||
if _lh is not None:
|
if _lh is not None:
|
||||||
tree = _lh.fromstring(cooked_html)
|
tree = _lh.fromstring(cooked_html)
|
||||||
return _clean_text(tree.text_content())
|
return _clean_text(tree.text_content())
|
||||||
|
|
||||||
# 回退:正则去标签
|
# 回退:正则去标签
|
||||||
text = re.sub(r"<[^>]+>", " ", cooked_html)
|
text = re.sub(r"<[^>]+>", " ", cooked_html)
|
||||||
text = re.sub(r"\s+", " ", text).strip()
|
text = re.sub(r"\s+", " ", text).strip()
|
||||||
@@ -567,25 +572,34 @@ class LinuxDoPreviewPlugin(Star):
|
|||||||
if self._has_session_cookie():
|
if self._has_session_cookie():
|
||||||
cookie_value = (self.config.get("linuxdo_session_cookie", "") or "").strip()
|
cookie_value = (self.config.get("linuxdo_session_cookie", "") or "").strip()
|
||||||
if not self._inject_session_cookie(session, cookie_value):
|
if not self._inject_session_cookie(session, cookie_value):
|
||||||
self._auth_check_done = True
|
with self._auth_lock:
|
||||||
self._logged_in = False
|
self._auth_check_done = True
|
||||||
|
self._logged_in = False
|
||||||
return False
|
return False
|
||||||
# 校验结果只算一次(Cookie 有效性跨请求稳定)
|
# 校验结果只算一次(Cookie 有效性跨请求稳定)
|
||||||
if not self._auth_check_done:
|
with self._auth_lock:
|
||||||
self._logged_in = self._check_login_state(session)
|
need_check = not self._auth_check_done
|
||||||
self._auth_check_done = True
|
if need_check:
|
||||||
if self._logged_in:
|
logged_in = self._check_login_state(session)
|
||||||
|
with self._auth_lock:
|
||||||
|
self._logged_in = logged_in
|
||||||
|
self._auth_check_done = True
|
||||||
|
if logged_in:
|
||||||
logger.info("[LinuxDoPreview] Cookie 登录验证成功")
|
logger.info("[LinuxDoPreview] Cookie 登录验证成功")
|
||||||
else:
|
else:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"[LinuxDoPreview] 会话 Cookie 无效或已过期,将匿名访问。"
|
"[LinuxDoPreview] 会话 Cookie 无效或已过期,将匿名访问。"
|
||||||
"请在浏览器重新获取 Cookie(推荐 _t,长效)后填入配置。"
|
"请在浏览器重新获取 Cookie(推荐 _t,长效)后填入配置。"
|
||||||
)
|
)
|
||||||
return self._logged_in
|
with self._auth_lock:
|
||||||
|
return self._logged_in
|
||||||
|
|
||||||
# ── 仅用户名/密码:受 hCaptcha 限制,无法自动登录(仅提示一次) ──
|
# ── 仅用户名/密码:受 hCaptcha 限制,无法自动登录(仅提示一次) ──
|
||||||
if self._has_auto_login() and not self._auth_check_done:
|
with self._auth_lock:
|
||||||
self._auth_check_done = True
|
need_warn = self._has_auto_login() and not self._auth_check_done
|
||||||
|
if need_warn:
|
||||||
|
self._auth_check_done = True
|
||||||
|
if need_warn:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"[LinuxDoPreview] linux.do 登录启用了 hCaptcha 人机验证,账号密码"
|
"[LinuxDoPreview] linux.do 登录启用了 hCaptcha 人机验证,账号密码"
|
||||||
"自动登录不可用。请在浏览器登录 linux.do 后,F12 → Application → "
|
"自动登录不可用。请在浏览器登录 linux.do 后,F12 → Application → "
|
||||||
@@ -594,27 +608,18 @@ class LinuxDoPreviewPlugin(Star):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# 都没配置 / 自动登录不可用 → 匿名
|
# 都没配置 / 自动登录不可用 → 匿名
|
||||||
self._logged_in = False
|
with self._auth_lock:
|
||||||
|
self._logged_in = False
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _fetch_topic_data(self, session, url: str) -> dict | None:
|
def _fetch_topic_data(self, session, url: str) -> dict | None:
|
||||||
"""通过 Discourse JSON API 获取完整的主题数据
|
"""通过 Discourse JSON API 获取完整的主题数据
|
||||||
|
|
||||||
返回的 dict 包含帖子原始数据(cooked HTML、作者、标签、统计等),
|
返回的 dict 包含帖子原始数据(cooked HTML、作者、标签、统计等),
|
||||||
可同时供文本提取和自定义 HTML 渲染使用。
|
可同时供文本提取和自定义 HTML 渲染使用。
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
json_url = url.rstrip('/')
|
json_url = self._build_json_url(url)
|
||||||
if not json_url.endswith('.json'):
|
|
||||||
parts = json_url.split('/')
|
|
||||||
t_idx = -1
|
|
||||||
for i, p in enumerate(parts):
|
|
||||||
if p == 't':
|
|
||||||
t_idx = i
|
|
||||||
break
|
|
||||||
if t_idx >= 0 and len(parts) > t_idx + 2:
|
|
||||||
json_url = '/'.join(parts[:t_idx + 3])
|
|
||||||
json_url += '.json'
|
|
||||||
|
|
||||||
logger.info(f"[LinuxDoPreview] 拉取 topic JSON: {json_url}")
|
logger.info(f"[LinuxDoPreview] 拉取 topic JSON: {json_url}")
|
||||||
resp = session.fetch(json_url)
|
resp = session.fetch(json_url)
|
||||||
@@ -749,7 +754,6 @@ class LinuxDoPreviewPlugin(Star):
|
|||||||
posts_count = topic_data.get("posts_count", 0)
|
posts_count = topic_data.get("posts_count", 0)
|
||||||
views = topic_data.get("views", 0)
|
views = topic_data.get("views", 0)
|
||||||
like_count = topic_data.get("like_count", 0)
|
like_count = topic_data.get("like_count", 0)
|
||||||
created_at = topic_data.get("created_at", "")
|
|
||||||
tags = topic_data.get("tags", []) or []
|
tags = topic_data.get("tags", []) or []
|
||||||
|
|
||||||
post_stream = topic_data.get("post_stream", {}) or {}
|
post_stream = topic_data.get("post_stream", {}) or {}
|
||||||
@@ -774,12 +778,14 @@ class LinuxDoPreviewPlugin(Star):
|
|||||||
if "{size}" in author_avatar:
|
if "{size}" in author_avatar:
|
||||||
author_avatar = author_avatar.replace("{size}", "120")
|
author_avatar = author_avatar.replace("{size}", "120")
|
||||||
post_created = first.get("created_at", "") or ""
|
post_created = first.get("created_at", "") or ""
|
||||||
post_like = first.get("like_count", 0)
|
|
||||||
cooked_html = first.get("cooked", "") or ""
|
cooked_html = first.get("cooked", "") or ""
|
||||||
|
|
||||||
# 把 Discourse 相对资源 URL 补全为绝对 URL
|
# 把 Discourse 相对资源 URL 补全为绝对 URL
|
||||||
cooked_html = self._normalize_cooked_urls(cooked_html)
|
cooked_html = self._normalize_cooked_urls(cooked_html)
|
||||||
|
|
||||||
|
# 消毒 HTML:移除危险标签和事件属性
|
||||||
|
cooked_html = self._sanitize_html(cooked_html)
|
||||||
|
|
||||||
# 发布时间格式化
|
# 发布时间格式化
|
||||||
created_text = ""
|
created_text = ""
|
||||||
if post_created:
|
if post_created:
|
||||||
@@ -981,6 +987,27 @@ class LinuxDoPreviewPlugin(Star):
|
|||||||
return f"{n/1000:.1f}k"
|
return f"{n/1000:.1f}k"
|
||||||
return str(n)
|
return str(n)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _sanitize_html(html_str: str) -> str:
|
||||||
|
"""消毒 HTML:移除 <script>、<iframe>、<object>、<embed> 及 on* 事件属性"""
|
||||||
|
if not html_str:
|
||||||
|
return ""
|
||||||
|
html_str = re.sub(
|
||||||
|
r'<\s*(script|iframe|object|embed|applet|form|input|textarea|button|select)\b[^>]*>.*?<\s*/\s*\1\s*>',
|
||||||
|
'', html_str, flags=re.IGNORECASE | re.DOTALL
|
||||||
|
)
|
||||||
|
html_str = re.sub(
|
||||||
|
r'<\s*(script|iframe|object|embed|applet|form|input|textarea|button|select)\b[^>]*/?\s*>',
|
||||||
|
'', html_str, flags=re.IGNORECASE
|
||||||
|
)
|
||||||
|
html_str = re.sub(r'\s+on\w+\s*=\s*"[^"]*"', '', html_str, flags=re.IGNORECASE)
|
||||||
|
html_str = re.sub(r"\s+on\w+\s*=\s*'[^']*'", '', html_str, flags=re.IGNORECASE)
|
||||||
|
html_str = re.sub(r'\s+on\w+\s*=\s*[^\s>]+', '', html_str, flags=re.IGNORECASE)
|
||||||
|
html_str = re.sub(
|
||||||
|
r'href\s*=\s*["\']?\s*javascript:', 'href="#', html_str, flags=re.IGNORECASE
|
||||||
|
)
|
||||||
|
return html_str
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _normalize_cooked_urls(cooked_html: str) -> str:
|
def _normalize_cooked_urls(cooked_html: str) -> str:
|
||||||
"""将 cooked 中的相对资源 URL 转绝对 URL,剥离轻臾框包裹与 meta 信息
|
"""将 cooked 中的相对资源 URL 转绝对 URL,剥离轻臾框包裹与 meta 信息
|
||||||
@@ -1000,73 +1027,63 @@ class LinuxDoPreviewPlugin(Star):
|
|||||||
if not cooked_html:
|
if not cooked_html:
|
||||||
return ""
|
return ""
|
||||||
try:
|
try:
|
||||||
import re as _re
|
|
||||||
# 1) 绝对化 src/href(相对与协议无关 URL)
|
# 1) 绝对化 src/href(相对与协议无关 URL)
|
||||||
cooked_html = _re.sub(
|
cooked_html = re.sub(
|
||||||
r'(src|href)="(//[^"]+)"',
|
r'(src|href)="(//[^"]+)"',
|
||||||
r'\1="https:\2',
|
r'\1="https:\2',
|
||||||
cooked_html,
|
cooked_html,
|
||||||
)
|
)
|
||||||
cooked_html = _re.sub(
|
cooked_html = re.sub(
|
||||||
r'(src|href)="(/uploads/[^"]+)"',
|
r'(src|href)="(/uploads/[^"]+)"',
|
||||||
r'\1="https://linux.do\2',
|
r'\1="https://linux.do\2',
|
||||||
cooked_html,
|
cooked_html,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 2) 整块剥离 lightbox-wrapper:仅保留内部 <img>,丢弃其余
|
# 2) 使用 lxml 处理嵌套 div 结构(lightbox-wrapper、meta 等)
|
||||||
def _pick_imgs(block: str) -> str:
|
if _lh is not None:
|
||||||
imgs = _re.findall(r'<img\b[^>]*>', block, flags=_re.IGNORECASE)
|
try:
|
||||||
return "".join(imgs)
|
tree = _lh.fromstring(f"<root>{cooked_html}</root>")
|
||||||
|
|
||||||
cooked_html = _re.sub(
|
# lightbox-wrapper:只保留内部 <img>,丢弃其余
|
||||||
r'<div[^>]*class="[^"]*lightbox-wrapper[^"]*"[^>]*>(.*?)</div>',
|
for wrapper in tree.cssselect(".lightbox-wrapper"):
|
||||||
lambda m: _pick_imgs(m.group(1)),
|
imgs = wrapper.cssselect("img")
|
||||||
cooked_html,
|
parent = wrapper.getparent()
|
||||||
flags=_re.DOTALL,
|
if parent is None:
|
||||||
)
|
continue
|
||||||
|
idx = list(parent).index(wrapper)
|
||||||
|
for i, img in enumerate(imgs):
|
||||||
|
parent.insert(idx + i, img)
|
||||||
|
parent.remove(wrapper)
|
||||||
|
|
||||||
# 3) 退路:直接裸的 <a class="lightbox"> 包裹,剥 a、保留 img
|
# 裸 <a class="lightbox">:剥 a、保留子元素
|
||||||
cooked_html = _re.sub(
|
for a in tree.cssselect("a.lightbox"):
|
||||||
|
parent = a.getparent()
|
||||||
|
if parent is None:
|
||||||
|
continue
|
||||||
|
idx = list(parent).index(a)
|
||||||
|
for i, child in enumerate(list(a)):
|
||||||
|
parent.insert(idx + i, child)
|
||||||
|
parent.remove(a)
|
||||||
|
|
||||||
|
# 删除 meta 信息块、文件名、代码块工具栏、download 按钮
|
||||||
|
for sel in [".meta", ".filename", ".codeblock-buttons",
|
||||||
|
".pre-actions", "a.download"]:
|
||||||
|
for el in tree.cssselect(sel):
|
||||||
|
el.getparent().remove(el)
|
||||||
|
|
||||||
|
cooked_html = "".join(
|
||||||
|
str(_lh.tostring(child, encoding="unicode"))
|
||||||
|
for child in tree
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 3) 正则回退:处理 lxml 未覆盖的残留
|
||||||
|
cooked_html = re.sub(
|
||||||
r'<a [^>]*class="[^"]*\blightbox\b[^"]*"[^>]*>(.*?)</a>',
|
r'<a [^>]*class="[^"]*\blightbox\b[^"]*"[^>]*>(.*?)</a>',
|
||||||
r'\1',
|
r'\1',
|
||||||
cooked_html,
|
cooked_html,
|
||||||
flags=_re.DOTALL,
|
flags=re.DOTALL,
|
||||||
)
|
|
||||||
|
|
||||||
# 4) 删除所有残留的 meta 信息块(文件尺寸、文件名、下载按钮等)
|
|
||||||
cooked_html = _re.sub(
|
|
||||||
r'<div[^>]*class="[^"]*\bmeta\b[^"]*"[^>]*>.*?</div>',
|
|
||||||
'',
|
|
||||||
cooked_html,
|
|
||||||
flags=_re.DOTALL,
|
|
||||||
)
|
|
||||||
cooked_html = _re.sub(
|
|
||||||
r'<span[^>]*class="[^"]*\bfilename\b[^"]*"[^>]*>.*?</span>',
|
|
||||||
'',
|
|
||||||
cooked_html,
|
|
||||||
flags=_re.DOTALL,
|
|
||||||
)
|
|
||||||
|
|
||||||
# 5) 删除代码块顶部的工具栏(copy/undo 按钮)防止占位
|
|
||||||
cooked_html = _re.sub(
|
|
||||||
r'<div[^>]*class="[^"]*\bcodeblock-buttons\b[^"]*"[^>]*>.*?</div>',
|
|
||||||
'',
|
|
||||||
cooked_html,
|
|
||||||
flags=_re.DOTALL,
|
|
||||||
)
|
|
||||||
cooked_html = _re.sub(
|
|
||||||
r'<pre[^>]*>\s*<div[^>]*class="[^"]*\bpre-actions\b[^"]*"[^>]*>.*?</div>',
|
|
||||||
'<pre>',
|
|
||||||
cooked_html,
|
|
||||||
flags=_re.DOTALL,
|
|
||||||
)
|
|
||||||
|
|
||||||
# 6) 删除 download 按钮、悬浮提示等装饰
|
|
||||||
cooked_html = _re.sub(
|
|
||||||
r'<a[^>]*class="[^"]*\bdownload[^"]*"[^>]*>.*?</a>',
|
|
||||||
'',
|
|
||||||
cooked_html,
|
|
||||||
flags=_re.DOTALL,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -1087,66 +1104,68 @@ class LinuxDoPreviewPlugin(Star):
|
|||||||
if not ctx:
|
if not ctx:
|
||||||
return None
|
return None
|
||||||
page = ctx.new_page()
|
page = ctx.new_page()
|
||||||
page.set_viewport_size({"width": 820, "height": 1200})
|
|
||||||
|
|
||||||
# 设置内容,等待图片资源加载
|
|
||||||
page.set_content(html, wait_until="domcontentloaded", timeout=timeout_ms)
|
|
||||||
|
|
||||||
# 主动等所有 <img> 加载完成(最多 3s),并剔除加载失败的图
|
|
||||||
page.evaluate("""() => new Promise(resolve => {
|
|
||||||
const imgs = document.querySelectorAll('img');
|
|
||||||
if (!imgs.length) return resolve();
|
|
||||||
let done = 0;
|
|
||||||
const tick = (img) => {
|
|
||||||
done++;
|
|
||||||
// 图加载失败:移除 <img> 避免占位巨大空白
|
|
||||||
if (img.complete && img.naturalWidth === 0) {
|
|
||||||
img.remove();
|
|
||||||
}
|
|
||||||
if (done >= imgs.length) resolve();
|
|
||||||
};
|
|
||||||
imgs.forEach(img => {
|
|
||||||
if (img.complete) tick(img);
|
|
||||||
else {
|
|
||||||
img.addEventListener('load', () => tick(img), { once: true });
|
|
||||||
img.addEventListener('error', () => tick(img), { once: true });
|
|
||||||
}
|
|
||||||
});
|
|
||||||
setTimeout(resolve, 3000);
|
|
||||||
})""")
|
|
||||||
|
|
||||||
page.wait_for_timeout(300)
|
|
||||||
|
|
||||||
# ── 自适应截图:总是优先对 .card 元素截图,按内容实际边界拍 ──
|
|
||||||
# 元素截图零空白、零截断,不受 viewport 高度限制。
|
|
||||||
# `screenshot_full_page` 仅作为后备回退:元素截图失败时才使用。
|
|
||||||
card_locator = page.locator(".card")
|
|
||||||
full_page = self.config.get("screenshot_full_page", True)
|
|
||||||
try:
|
try:
|
||||||
if card_locator.count() > 0:
|
page.set_viewport_size({"width": 820, "height": 1200})
|
||||||
card_locator.first.screenshot(
|
|
||||||
path=str(save_path),
|
# 设置内容,等待图片资源加载
|
||||||
timeout=timeout_ms,
|
page.set_content(html, wait_until="domcontentloaded", timeout=timeout_ms)
|
||||||
)
|
|
||||||
else:
|
# 主动等所有 <img> 加载完成(最多 3s),并剔除加载失败的图
|
||||||
|
page.evaluate("""() => new Promise(resolve => {
|
||||||
|
const imgs = document.querySelectorAll('img');
|
||||||
|
if (!imgs.length) return resolve();
|
||||||
|
let done = 0;
|
||||||
|
const tick = (img) => {
|
||||||
|
done++;
|
||||||
|
// 图加载失败:移除 <img> 避免占位巨大空白
|
||||||
|
if (img.complete && img.naturalWidth === 0) {
|
||||||
|
img.remove();
|
||||||
|
}
|
||||||
|
if (done >= imgs.length) resolve();
|
||||||
|
};
|
||||||
|
imgs.forEach(img => {
|
||||||
|
if (img.complete) tick(img);
|
||||||
|
else {
|
||||||
|
img.addEventListener('load', () => tick(img), { once: true });
|
||||||
|
img.addEventListener('error', () => tick(img), { once: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
setTimeout(resolve, 3000);
|
||||||
|
})""")
|
||||||
|
|
||||||
|
page.wait_for_timeout(300)
|
||||||
|
|
||||||
|
# ── 自适应截图:总是优先对 .card 元素截图,按内容实际边界拍 ──
|
||||||
|
# 元素截图零空白、零截断,不受 viewport 高度限制。
|
||||||
|
# `screenshot_full_page` 仅作为后备回退:元素截图失败时才使用。
|
||||||
|
card_locator = page.locator(".card")
|
||||||
|
full_page = self.config.get("screenshot_full_page", True)
|
||||||
|
try:
|
||||||
|
if card_locator.count() > 0:
|
||||||
|
card_locator.first.screenshot(
|
||||||
|
path=str(save_path),
|
||||||
|
timeout=timeout_ms,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
page.screenshot(
|
||||||
|
path=str(save_path),
|
||||||
|
full_page=full_page,
|
||||||
|
timeout=timeout_ms,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# 回退:若元素截图失败(少见),退到全页截图
|
||||||
page.screenshot(
|
page.screenshot(
|
||||||
path=str(save_path),
|
path=str(save_path),
|
||||||
full_page=full_page,
|
full_page=full_page,
|
||||||
timeout=timeout_ms,
|
timeout=timeout_ms,
|
||||||
)
|
)
|
||||||
except Exception:
|
sz = save_path.stat().st_size
|
||||||
# 回退:若元素截图失败(少见),退到全页截图
|
logger.info(
|
||||||
page.screenshot(
|
f"[LinuxDoPreview] 渲染截图: {save_path.name} ({sz / 1024:.1f} KB)"
|
||||||
path=str(save_path),
|
|
||||||
full_page=full_page,
|
|
||||||
timeout=timeout_ms,
|
|
||||||
)
|
)
|
||||||
sz = save_path.stat().st_size
|
return save_path
|
||||||
logger.info(
|
finally:
|
||||||
f"[LinuxDoPreview] 渲染截图: {save_path.name} ({sz / 1024:.1f} KB)"
|
page.close()
|
||||||
)
|
|
||||||
page.close()
|
|
||||||
return save_path
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"[LinuxDoPreview] HTML 渲染失败: {type(e).__name__}: {e}")
|
logger.warning(f"[LinuxDoPreview] HTML 渲染失败: {type(e).__name__}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|||||||
Reference in New Issue
Block a user