-
Notifications
You must be signed in to change notification settings - Fork 9.6k
feat(route/baidu): add BAIDU_COOKIE support and extract shared … #21663
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
e9453f8
7b880dd
e083661
e2d9a67
4b80d98
7d7efc7
47415db
8b3a080
fc7ae10
e486eb3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -85,6 +85,7 @@ type ConfigEnvKeys = | |
| | 'BITBUCKET_USERNAME' | ||
| | 'BITBUCKET_PASSWORD' | ||
| | 'BTBYR_HOST' | ||
| | 'BAIDU_COOKIE' | ||
| | 'BTBYR_COOKIE' | ||
| | 'BUPT_PORTAL_COOKIE' | ||
| | 'CAIXIN_COOKIE' | ||
|
|
@@ -348,6 +349,9 @@ export type Config = { | |
| }; | ||
|
|
||
| // Route-specific Configurations | ||
| baidu: { | ||
| cookie?: string; | ||
| }; | ||
| bilibili: { | ||
| cookies: Record<string, string | undefined>; | ||
| dmImgList?: string; | ||
|
|
@@ -763,7 +767,7 @@ const calculateValue = () => { | |
| type: envs.CACHE_TYPE || (envs.CACHE_TYPE === '' ? '' : 'memory'), // 缓存类型,支持 'memory' 和 'redis',设为空可以禁止缓存 | ||
| requestTimeout: toInt(envs.CACHE_REQUEST_TIMEOUT, 60), | ||
| routeExpire: toInt(envs.CACHE_EXPIRE, 5 * 60), // 路由缓存时间,单位为秒 | ||
| contentExpire: toInt(envs.CACHE_CONTENT_EXPIRE, 1 * 60 * 60), // 不变内容缓存时间,单位为秒 | ||
| contentExpire: toInt(envs.CACHE_CONTENT_EXPIRE, 60 * 60), // 不变内容缓存时间,单位为秒 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do not make any changes to the default value. |
||
| }, | ||
| memory: { | ||
| max: toInt(envs.MEMORY_MAX, Math.pow(2, 8)), // The maximum number of items that remain in the cache. This must be a positive finite intger. | ||
|
|
@@ -843,6 +847,9 @@ const calculateValue = () => { | |
| }, | ||
|
|
||
| // Route-specific Configurations | ||
| baidu: { | ||
| cookie: envs.BAIDU_COOKIE, | ||
| }, | ||
| bilibili: { | ||
| cookies: bilibili_cookies, | ||
| dmImgList: envs.BILIBILI_DM_IMG_LIST, | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,122 @@ | ||||||||
| import { config } from '@/config'; | ||||||||
| import ConfigNotFoundError from '@/errors/types/config-not-found'; | ||||||||
| import cache from '@/utils/cache'; | ||||||||
|
|
||||||||
/**
 * Parses a raw `Cookie` header string (`name=value; name2=value2`) into the
 * cookie objects accepted by Puppeteer's `page.setCookie`.
 *
 * Only the first `=` of each pair separates name from value, so values that
 * themselves contain `=` (e.g. base64 padding) are preserved intact.
 * Blank segments and segments with an empty name (such as `=value`) are
 * skipped: a cookie without a name is invalid and would otherwise be handed
 * to the browser together with the valid ones.
 *
 * @param cookieStr - Semicolon-separated cookie pairs.
 * @param domain - Domain assigned to every returned cookie. Defaults to `.tieba.baidu.com`.
 * @returns One `{ name, value, domain }` object per valid pair, in input order.
 */
export function parseBaiduCookies(cookieStr: string, domain: string = '.tieba.baidu.com'): Array<{ name: string; value: string; domain: string }> {
    const cookies: Array<{ name: string; value: string; domain: string }> = [];

    for (const segment of cookieStr.split(';')) {
        const pair = segment.trim();
        if (pair.length === 0) {
            continue;
        }

        const separatorIndex = pair.indexOf('=');
        // A bare token without '=' is kept as a cookie with an empty value.
        const name = separatorIndex === -1 ? pair : pair.slice(0, separatorIndex).trim();
        if (name.length === 0) {
            continue;
        }

        const value = separatorIndex === -1 ? '' : pair.slice(separatorIndex + 1).trim();
        cookies.push({ name, value, domain });
    }

    return cookies;
}
|
Comment on lines
+9
to
+23
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do not reinvent the wheel. |
||||||||
|
|
||||||||
/**
 * Throws when the given HTML looks like Baidu's security-verification page.
 *
 * Detection is a plain substring match on the marker text `安全验证`. The longer
 * phrase `百度安全验证` contains that marker, so a single check covers both.
 *
 * @param html - Page HTML to inspect.
 * @throws Error if the marker text is present.
 */
export function checkSecurityVerification(html: string): void {
    const verificationMarker = '安全验证';
    if (html.includes(verificationMarker)) {
        throw new Error('Baidu security verification required. The cookie may be expired or invalid. Please update your BAIDU_COOKIE.');
    }
}
|
|
||||||||
/**
 * Fetches the rendered HTML of a Tieba page through Puppeteer.
 *
 * The configured `BAIDU_COOKIE` is applied before navigation, transient
 * failures are retried with a linearly growing back-off, and the result is
 * stored via `cache.tryGet` under `cacheKey` with `config.cache.routeExpire`.
 *
 * The security-verification check runs inside the cache callback, so a
 * verification page makes the callback throw instead of returning a value to
 * be cached (presumably `cache.tryGet` does not store anything when the
 * callback throws — confirm against `@/utils/cache`).
 *
 * @param url - Page to load.
 * @param cacheKey - Key under which the HTML is cached.
 * @param options - `waitForSelector`: selector awaited after load (best effort);
 *   `timeout`: milliseconds to wait for that selector (default 3000);
 *   `retries`: maximum number of attempts (default 3).
 * @returns The page HTML.
 * @throws ConfigNotFoundError when `BAIDU_COOKIE` is not configured.
 * @throws Error when the page is a security-verification page, or when every attempt fails.
 */
export async function getTiebaPageContent(
    url: string,
    cacheKey: string,
    options: {
        waitForSelector?: string;
        timeout?: number;
        retries?: number;
    } = {}
): Promise<string> {
    const cookie = config.baidu.cookie;

    if (!cookie) {
        throw new ConfigNotFoundError('Baidu Tieba RSS is disabled due to the lack of <a href="https://docs.rsshub.app/deploy/config#baidu">BAIDU_COOKIE</a>');
    }

    // NOTE(review): a reviewer asked for this dynamic import to be avoided ("No await import").
    // Switching to a static top-level import is a file-level change and is left to the author.
    const { getPuppeteerPage } = await import('@/utils/puppeteer');

    const { waitForSelector = '.thread-card-wrapper, .virtual-list-item, .thread-content-box, .thread-card', timeout = 3000, retries = 3 } = options;

    const sleep = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));

    const data = await cache.tryGet(
        cacheKey,
        async () => {
            let lastError: unknown;

            /* eslint-disable no-await-in-loop -- Intentional sequential retry logic */
            for (let attempt = 0; attempt < retries; attempt++) {
                const { page, destroy } = await getPuppeteerPage(url, { noGoto: true });
                let html: string | undefined;

                try {
                    // Set cookies before the first navigation so no extra round trip is needed.
                    await page.setCookie(...parseBaiduCookies(cookie));

                    // NOTE(review): a reviewer indicated that the shared Puppeteer helper kills the
                    // session on its own timer, so this 60 s navigation timeout may never be
                    // reached — confirm against `@/utils/puppeteer`.
                    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });

                    // Give the page a moment to settle.
                    await sleep(2000);

                    // Best-effort wait for the thread list; a timeout here is not fatal.
                    try {
                        await page.waitForSelector(waitForSelector, { timeout });
                    } catch {
                        // Selector did not appear in time; continue with whatever was rendered.
                    }

                    html = await page.content();
                } catch (error) {
                    lastError = error;
                } finally {
                    // Release the page before any back-off so it is not held open while idle.
                    await destroy();
                }

                if (html !== undefined) {
                    // Reject verification pages here so they are thrown rather than returned for caching.
                    checkSecurityVerification(html);
                    return html;
                }

                // Back off before the next attempt; no wait after the final one.
                if (attempt < retries - 1) {
                    await sleep(1000 * (attempt + 1));
                }
            }
            /* eslint-enable no-await-in-loop */

            throw lastError ?? new Error('Failed to fetch page content');
        },
        config.cache.routeExpire,
        false
    );

    const html = data as string;
    // Also guards against verification pages cached before the check moved into the callback.
    checkSecurityVerification(html);
    return html;
}
|
|
||||||||
/**
 * Normalizes a link found on a Tieba page into an absolute URL.
 *
 * - Empty input returns an empty string.
 * - `http://` / `https://` URLs are returned unchanged.
 * - Protocol-relative URLs (`//host/path`) take the scheme of `base`
 *   (falling back to `https:`) instead of being appended to it.
 * - Anything else is treated as a path and joined to `base` with exactly one `/`.
 *
 * @param href - Link as found in the page; may be relative.
 * @param base - Origin that relative paths are resolved against.
 * @returns The absolute URL, or `''` when `href` is empty.
 */
export function normalizeUrl(href: string, base: string = 'https://tieba.baidu.com'): string {
    if (!href) {
        return '';
    }
    // Require a full scheme separator so relative paths like `httpdocs/x` are not mistaken for absolute URLs.
    if (/^https?:\/\//i.test(href)) {
        return href;
    }
    if (href.startsWith('//')) {
        const scheme = /^[a-z][a-z0-9+.-]*:/i.exec(base)?.[0] ?? 'https:';
        return `${scheme}${href}`;
    }
    // Drop trailing slashes from the base so the join never produces `//`.
    const origin = base.replace(/\/+$/, '');
    const path = href.startsWith('/') ? href : `/${href}`;
    return `${origin}${path}`;
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,85 +1,101 @@ | ||
| import { load } from 'cheerio'; | ||
| import { raw } from 'hono/html'; | ||
| import { renderToString } from 'hono/jsx/dom/server'; | ||
|
|
||
| import type { Route } from '@/types'; | ||
| import got from '@/utils/got'; | ||
| import { parseDate } from '@/utils/parse-date'; | ||
| import timezone from '@/utils/timezone'; | ||
|
|
||
| import { getTiebaPageContent, normalizeUrl } from './common'; | ||
| import { parseRelativeTime, parseThreads } from './utils'; | ||
|
|
||
| export const route: Route = { | ||
| path: ['/tieba/forum/good/:kw/:cid?/:sortBy?', '/tieba/forum/:kw/:sortBy?'], | ||
| categories: ['bbs'], | ||
| example: '/baidu/tieba/forum/good/女图', | ||
| parameters: { kw: '吧名', cid: '精品分类,默认为 `0`(全部分类),如果不传 `cid` 则获取全部分类', sortBy: '排序方式:`created`, `replied`。默认为 `created`' }, | ||
| features: { | ||
| requireConfig: false, | ||
| requirePuppeteer: false, | ||
| antiCrawler: false, | ||
| requireConfig: [ | ||
| { | ||
| name: 'BAIDU_COOKIE', | ||
| optional: false, | ||
| description: '百度 cookie 值,用于需要登录的贴吧页面', | ||
| }, | ||
| ], | ||
| requirePuppeteer: true, | ||
| antiCrawler: true, | ||
| supportBT: false, | ||
| supportPodcast: false, | ||
| supportScihub: false, | ||
| }, | ||
| name: '精品帖子', | ||
| maintainers: ['u3u'], | ||
| maintainers: ['u3u', 'FlanChanXwO'], | ||
| handler, | ||
| }; | ||
|
|
||
| async function handler(ctx) { | ||
| // sortBy: created, replied | ||
| const { kw, cid = '0', sortBy = 'created' } = ctx.req.param(); | ||
| const sortParam = sortBy === 'replied' ? '&sc=67108864' : ''; | ||
|
|
||
| // PC端:https://tieba.baidu.com/f?kw=${encodeURIComponent(kw)} | ||
| // 移动端接口:https://tieba.baidu.com/mo/q/m?kw=${encodeURIComponent(kw)}&lp=5024&forum_recommend=1&lm=0&cid=0&has_url_param=1&pn=0&is_ajax=1 | ||
| const params = { kw: encodeURIComponent(kw) }; | ||
| ctx.req.path.includes('good') && (params.tab = 'good'); | ||
| cid && (params.cid = cid); | ||
| const { data } = await got(`https://tieba.baidu.com/f`, { | ||
| headers: { | ||
| Referer: 'https://tieba.baidu.com/', | ||
| }, | ||
| searchParams: params, | ||
| }); | ||
| // 固定抓取3页,约30条帖子 | ||
| const maxPages = 3; | ||
|
|
||
| // 并发获取所有页面 | ||
| const pagePromises = []; | ||
| for (let pageNum = 0; pageNum < maxPages; pageNum++) { | ||
| const pageUrl = `https://tieba.baidu.com/f?kw=${encodeURIComponent(kw)}&pn=${pageNum * 50}${cid === '0' ? '' : `&cid=${cid}`}${ctx.req.path.includes('good') ? '&tab=good' : ''}${pageNum === 0 ? '' : '&ie=utf-8'}${sortParam}`; | ||
|
|
||
| const promise = getTiebaPageContent(pageUrl, `tieba:forum:${kw}:${cid}:${sortBy}:page${pageNum}`, { waitForSelector: '.thread-card-wrapper', timeout: 3000 }); | ||
| pagePromises.push(promise); | ||
| } | ||
|
|
||
| const threadListHTML = load(data)('code[id="pagelet_html_frs-list/pagelet/thread_list"]') | ||
| .contents() | ||
| .filter((e) => e.nodeType === '8'); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No page turning. Extract content from the first page only. |
||
| // 等待所有页面获取完成 | ||
| const pageResults = await Promise.all(pagePromises); | ||
|
|
||
| const $ = load(threadListHTML.prevObject[0].data); | ||
| const list = $('#thread_list > .j_thread_list[data-field]') | ||
| .toArray() | ||
| .map((element) => { | ||
| const item = $(element); | ||
| const { id, author_name } = item.data('field'); | ||
| const time = sortBy === 'created' ? item.find('.is_show_create_time').text().trim() : item.find('.threadlist_reply_date').text().trim(); | ||
| const title = item.find('a.j_th_tit').text().trim(); | ||
| const details = item.find('.threadlist_abs').text().trim(); | ||
| const medias = item | ||
| .find('.threadlist_media img') | ||
| .toArray() | ||
| .map((element) => { | ||
| const item = $(element); | ||
| return `<img src="${item.attr('bpic')}">`; | ||
| }) | ||
| .join(''); | ||
| // 解析所有页面数据并去重 | ||
| const threadMap = new Map(); | ||
| for (const html of pageResults) { | ||
| if (html && html.length > 0) { | ||
| const $ = load(html); | ||
| const threads = parseThreads($); | ||
| for (const thread of threads) { | ||
| // 使用帖子ID去重,只保留第一次出现的 | ||
| if (!threadMap.has(thread.id)) { | ||
| threadMap.set(thread.id, thread); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return { | ||
| title, | ||
| description: renderToString( | ||
| <> | ||
| <p>{details}</p> | ||
| <p>{raw(medias)}</p> | ||
| <p>作者:{author_name}</p> | ||
| </> | ||
| ), | ||
| pubDate: timezone(parseDate(time, ['HH:mm', 'M-D', 'YYYY-MM'], true), +8), | ||
| link: `https://tieba.baidu.com/p/${id}`, | ||
| }; | ||
| }); | ||
| const allThreads = [...threadMap.values()]; | ||
|
|
||
| if (allThreads.length === 0) { | ||
| throw new Error('No threads found. The cookie may be expired or invalid. Please check your BAIDU_COOKIE.'); | ||
| } | ||
|
|
||
| const list = allThreads.map((thread) => { | ||
| const parsedDate = parseRelativeTime(thread.time); | ||
| return { | ||
| title: thread.title, | ||
| link: normalizeUrl(thread.link) || `https://tieba.baidu.com/p/${thread.id}`, | ||
| pubDate: parsedDate ? timezone(parsedDate, +8) : undefined, | ||
| author: thread.author, | ||
| description: renderToString( | ||
| <> | ||
| {thread.content ? <p>{thread.content}</p> : null} | ||
| {thread.images && thread.images.length > 0 ? ( | ||
| <div> | ||
| {thread.images.map((img) => ( | ||
| <img src={img} alt="" style={{ maxWidth: '100%', margin: '5px 0' }} /> | ||
| ))} | ||
| </div> | ||
| ) : null} | ||
| </> | ||
| ), | ||
| }; | ||
| }); | ||
|
|
||
| return { | ||
| title: `${kw}吧`, | ||
| description: load(data)('meta[name="description"]').attr('content'), | ||
| link: `https://tieba.baidu.com/f?kw=${encodeURIComponent(kw)}`, | ||
| item: list, | ||
| }; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`A` comes before `I` and `T`