|
| 1 | +import * as cheerio from "cheerio" |
| 2 | + |
/**
 * Engagement counters scraped for a single article.
 */
interface ArticleStats {
  /** Number read from the article's "围观" (views) link. */
  views: number
  /** Number read from the article's "收藏" (collections) link. */
  collections: number
}
| 8 | + |
/**
 * Author details scraped from an article card.
 */
interface AuthorInfo {
  /** Display name; an empty string when no author link was found. */
  name: string
  /** Avatar image source, when the card contains one. */
  avatar?: string
  /** Link to the author's profile page, when available. */
  profileUrl?: string
}
| 15 | + |
/**
 * Everything scraped from one article card on the listing page.
 */
interface ArticleData {
  /** Article headline (trimmed text). */
  title: string
  /** Link to the article. */
  url: string
  /** Summary text shown on the card. */
  description: string
  /** Publish time exactly as the text appears on the page (not parsed). */
  publishTime: string
  /** Author details. */
  author: AuthorInfo
  /** View and collection counters. */
  stats: ArticleStats
  /** Album/column the article belongs to, if any. */
  album?: string
  /** Cover image source, if any. */
  image?: string
  /** Category label inferred from the article URL, if any. */
  category?: string
}
| 28 | + |
/**
 * Reads the trimmed text of the first descendant of `$element` that matches
 * `selector`.
 *
 * @param $element - Cheerio selection to search within.
 * @param selector - CSS selector for the descendant to read.
 * @returns The trimmed text, or an empty string when that text is empty.
 */
function safeExtract($element: cheerio.Cheerio<any>, selector: string): string {
  const firstMatch = $element.find(selector).first()
  const text = firstMatch.text().trim()
  return text ? text : ""
}
| 34 | + |
/**
 * Reads an attribute from the first descendant of `$element` that matches
 * `selector`.
 *
 * @param $element - Cheerio selection to search within.
 * @param selector - CSS selector for the descendant to read.
 * @param attribute - Name of the attribute to read.
 * @returns The attribute value, or an empty string when it is missing or empty.
 */
function safeExtractAttribute($element: cheerio.Cheerio<any>, selector: string, attribute: string): string {
  const value = $element.find(selector).first().attr(attribute)
  if (!value) return ""
  return value
}
| 39 | + |
/**
 * Turns a link scraped from the page into an absolute URL.
 *
 * - `http://` / `https://` links are returned unchanged (apart from trimming).
 * - Protocol-relative links (`//host/path`) get the scheme of `baseUrl`.
 * - Everything else is treated as a path and joined to `baseUrl` with exactly
 *   one slash, whether or not either side already has one.
 *
 * @param url - Raw link value; may be undefined when the attribute is missing.
 * @param baseUrl - Origin used to resolve relative links.
 * @returns The absolute URL, or an empty string when `url` is missing or blank.
 */
function formatUrl(url: string | undefined, baseUrl: string = "https://www.freebuf.com"): string {
  const link = url ? url.trim() : ""
  if (!link) return ""

  // Require the full "scheme://" prefix so a relative path that merely starts
  // with "http" (e.g. "httpfoo/bar") is not mistaken for an absolute URL.
  if (/^https?:\/\//i.test(link)) return link

  // Protocol-relative link: only the scheme is missing.
  if (link.startsWith("//")) {
    const scheme = /^[a-z][a-z\d+.-]*:/i.exec(baseUrl)
    return `${scheme ? scheme[0] : "https:"}${link}`
  }

  // Normalise the slashes on both sides of the join.
  const base = baseUrl.replace(/\/+$/, "")
  const path = link.replace(/^\/+/, "")
  return `${base}/${path}`
}
| 45 | + |
/**
 * Extracts the view ("围观") and collection ("收藏") counters from an article card.
 *
 * Each counter is read from the first `<span>` inside the link whose text
 * contains the corresponding label. A counter falls back to 0 when the link
 * is absent or its text is not a number.
 *
 * @param $article - Cheerio selection for one article card.
 * @returns The parsed counters.
 */
function extractStats($article: cheerio.Cheerio<any>): ArticleStats {
  // Shared lookup so both counters are located and parsed the same way.
  const readCount = (label: string): number => {
    const link = $article.find(`a:contains("${label}")`)
    if (!link.length) return 0

    // Drop thousands separators so "1,234" is not truncated to 1.
    const raw = link.find("span").first().text().replace(/,/g, "")
    // Explicit radix: never interpret "0x…" text as hexadecimal.
    const parsed = Number.parseInt(raw, 10)
    return Number.isNaN(parsed) ? 0 : parsed
  }

  return {
    views: readCount("围观"),
    collections: readCount("收藏"),
  }
}
| 66 | + |
/**
 * Extracts author details from an article card.
 *
 * The author is taken from the first link inside `.item-bottom`: the name is
 * the text of that link's last `<span>`, the profile URL is its `href`
 * (made absolute via `formatUrl`), and the avatar is the `src` of an
 * `.ant-avatar img` inside the link, when present.
 *
 * @param $article - Cheerio selection for one article card.
 * @returns The author info; `{ name: "" }` when the card has no author link.
 */
function extractAuthor($article: cheerio.Cheerio<any>): AuthorInfo {
  const link = $article.find(".item-bottom a").first()
  if (link.length === 0) return { name: "" }

  const info: AuthorInfo = {
    name: link.find("span").last().text().trim(),
    profileUrl: formatUrl(link.attr("href")),
  }

  // The avatar key is only set when the link actually contains an avatar image.
  const avatar = link.find(".ant-avatar img")
  if (avatar.length > 0) info.avatar = avatar.attr("src")

  return info
}
| 84 | + |
/**
 * Infers a category label from the path of the article's title link.
 *
 * @param $article - Cheerio selection for one article card.
 * @returns The label of the first matching path segment, or an empty string
 *          when the link is missing or matches none of the known segments.
 */
function extractCategory($article: cheerio.Cheerio<any>): string {
  const href = $article.find(".title-left .title").parent().attr("href") || ""

  // Checked in order; the first segment contained in the link wins.
  const sections: ReadonlyArray<readonly [string, string]> = [
    ["/articles/web/", "Web安全"],
    ["/articles/database/", "数据安全"],
    ["/articles/network/", "网络安全"],
    ["/articles/mobile/", "移动安全"],
    ["/articles/cloud/", "云安全"],
  ]

  for (const [segment, label] of sections) {
    if (href.includes(segment)) return label
  }
  return ""
}
| 97 | + |
/**
 * Derives an article id from a FreeBuf article URL by taking the first run of
 * digits in the last path segment (e.g. ".../460614.html" -> "460614").
 *
 * @param url - Article URL.
 * @returns The numeric id as a string, or an empty string when the last path
 *          segment contains no digits.
 */
function extractIdFromUrl(url: string): string {
  // Drop the query string and fragment first: either may contain "/" and would
  // otherwise be mistaken for the last path segment.
  const cutAt = url.search(/[?#]/)
  const path = cutAt === -1 ? url : url.slice(0, cutAt)

  const lastSegment = path.slice(path.lastIndexOf("/") + 1) // e.g. "460614.html"
  const match = lastSegment.match(/\d+/)
  return match ? match[0] : ""
}
| 106 | + |
/**
 * FreeBuf source: fetches the site's home page, scrapes every `.article-item`
 * card and maps the result to the item shape returned to `defineSource`.
 *
 * Cards without a title or without a link are skipped. A card that throws
 * while being parsed is logged and skipped so that one malformed card does
 * not discard the rest of the list.
 */
export default defineSource(async () => {
  const baseUrl = "https://www.freebuf.com"

  // NOTE(review): browser-like headers are presumably required for the site to
  // serve the regular HTML page — confirm before changing them.
  const html = await myFetch<string>(baseUrl, {
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
      "Referer": "https://www.freebuf.com/",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    },
  })
  const $ = cheerio.load(html)
  const articles: ArticleData[] = []

  // Walk every article card on the page.
  $(".article-item").each((index: number, articleElement) => {
    const $article = $(articleElement)

    try {
      // The title element sits inside the link that points at the article.
      const titleLink = $article.find(".title-left .title").parent()
      const title = titleLink.find(".title").text().trim()
      const url = formatUrl(titleLink.attr("href"), baseUrl)

      // An entry without a title or a link cannot be shown or opened, and
      // would end up with an empty id; skip it.
      if (!title || !url) return

      const album = safeExtract($article, ".from-column span")
      const image = safeExtractAttribute($article, ".img-view img", "src")
      const category = extractCategory($article)

      articles.push({
        title,
        url,
        description: safeExtract($article, ".item-right .text-line-2"),
        publishTime: safeExtract($article, ".item-bottom span:last-child"),
        author: extractAuthor($article),
        stats: extractStats($article),
        // Empty strings become undefined so optional fields are truly absent.
        album: album || undefined,
        image: image || undefined,
        category: category || undefined,
      })
    } catch (error) {
      console.warn(`解析第${index + 1}篇文章时出错:`, error instanceof Error ? error.message : String(error))
    }
  })

  // Convert to the item shape returned by this source.
  return articles.map(item => ({
    // Fall back to the URL when it carries no numeric id, so ids are never
    // empty and do not collide between such items.
    id: extractIdFromUrl(item.url) || item.url,
    title: item.title,
    url: item.url,
    extra: {
      hover: item.description,
      time: item.publishTime,
      author: item.author,
      stats: item.stats,
      album: item.album,
    },
  }))
})
0 commit comments