我有一个场景,需要从网站获取所有社交媒体链接。如果所有社交链接都在主页上,下面的代码就能把每个社交媒体链接都抓取下来。下面是代码示例:
使用cheeriojs
// Dependencies: cheerio for HTML parsing, axios for HTTP requests.
const cheerio = require('cheerio')
const axios = require('axios')
const https = require('https');

// WARNING: rejectUnauthorized:false disables TLS certificate validation and
// exposes every request to man-in-the-middle attacks. Keep this agent only
// for scraping sites with known-broken certificates, never for anything
// carrying credentials.
const agent = new https.Agent({
  rejectUnauthorized: false
});
/**
 * Fetch a page and return the `href` attribute of every <a> tag on it.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<(string|undefined)[]>} hrefs in document order
 *   (entries are `undefined` for anchors without an href attribute).
 * @throws Network/parse errors propagate to the caller.
 */
const getLinks = async (url) => {
  // BUG FIX: the original caught errors and *returned* the Error object,
  // so callers received an Error where an array was expected and later
  // crashed with a confusing "hrefs.filter is not a function" TypeError.
  // Let the rejection propagate instead; the caller already try/catches.
  const hrefs = []
  const body = await axios.get(url, { httpsAgent: agent })
  const $ = cheerio.load(body.data)
  $('a').each((i, link) => {
    hrefs.push($(link).attr('href'))
  })
  return hrefs
}
/**
 * Collect social-media links found on a page and log them.
 *
 * @param {string[]} socialLinks - Platform keywords to look for
 *   (e.g. 'facebook', 'twitter').
 * @param {string} url - Page to scan.
 * @returns {Promise<{platform: string, handle: string}[]>} the matches
 *   (also printed to the console); empty array on error.
 */
const getSocialLinks = async (socialLinks, url) => {
  try {
    const hrefs = await getLinks(url)
    const handles = []
    // The original abused Array#filter for side effects (its callback
    // returned undefined and the result was discarded); a plain loop
    // states the intent: we are collecting matches, not filtering.
    for (const href of hrefs) {
      if (!href) continue
      const platform = socialLinks.find(link => href.includes(link))
      if (platform) handles.push({ platform, handle: href })
    }
    console.log(handles);
    return handles
  } catch (error) {
    console.log(error)
    return []
  }
}
// NOTE(review): floating promise — safe here because getSocialLinks
// try/catches and logs internally, so this call can never reject.
getSocialLinks(['facebook', 'twitter', 'instagram', 'youtube', 'linkedin'], 'https://synavos.com')
如果所有社交媒体链接都在主页上,这段代码效果很好;但如果社交媒体链接出现在该网站的其他页面上,我就不知道该怎么抓取了。
以下是使用 puppeteer 的相同代码示例:
const puppeteer = require('puppeteer')
/**
 * Open a page in headless Chromium and return every anchor's raw `href`
 * attribute (may be relative URLs).
 *
 * @param {string} website - URL to visit.
 * @returns {Promise<string[]>} href attributes in document order.
 * @throws Launch/navigation errors propagate after the browser is closed.
 */
const getHrefsAttributes = async (website) => {
  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true
  });
  try {
    const [page] = await browser.pages();
    // networkidle2: navigation is "done" once <=2 network connections remain.
    // NOTE(review): 3000000 ms is a 50-minute timeout — likely a typo for
    // 30000; kept as-is to preserve behavior, but worth confirming.
    await page.goto(website, { waitUntil: 'networkidle2', timeout: 3000000 });
    return await page.evaluate(() =>
      Array.from(document.querySelectorAll('a[href]'), a => a.getAttribute('href'))
    );
  } finally {
    // BUG FIX: the original caught errors *before* closing the browser and
    // returned undefined, leaking a Chromium process on every failure and
    // crashing the caller with "hrefs.filter is not a function".
    // finally guarantees cleanup on both success and failure paths.
    await browser.close();
  }
}
/**
 * Scan a page with Puppeteer and report the social-media links found on it.
 *
 * @param {string} url - Page to scan.
 * @param {string[]} socialLinks - Platform keywords to look for.
 * @returns {Promise<{platform: string, handle: string}[]>} the matches
 *   (also printed to the console); empty array on error.
 */
const getSocialLinks = async (url, socialLinks) => {
  try {
    // ?? [] guards against getHrefsAttributes resolving to undefined
    // (the original version swallowed its errors), which previously
    // crashed here with a TypeError on .filter.
    const hrefs = (await getHrefsAttributes(url)) ?? []
    const handles = []
    // Plain loop instead of the original's side-effecting Array#filter;
    // the falsy-href guard matches the cheerio version of this function.
    for (const href of hrefs) {
      if (!href) continue
      const platform = socialLinks.find(link => href.includes(link))
      if (platform) handles.push({ platform, handle: href })
    }
    console.log(handles);
    return handles
  } catch (error) {
    console.log(error)
    return []
  }
}
// NOTE(review): floating promise — getSocialLinks has no internal try/catch
// in this version, so a navigation failure surfaces as an unhandled rejection.
getSocialLinks('https://synavos.com/', ['facebook', 'twitter', 'instagram', 'youtube', 'linkedin'])
例如,此URL https://netsoltech.com/在主页上没有其社交媒体链接。
答案 0(得分:1)
您可以使用堆栈/递归 (depth-first) 或队列 (breadth-first) 并运行搜索到一定深度,保留一组访问过的 URL 以避免循环。
一个好的设计可能会使用 generator:这样你可以无限期地继续搜索,直到达到一定深度或找到一定数量的结果为止。这为调用者提供了更大的灵活性,但代价是代码会更冗长。
您可能希望调整 href 检查或 URL 以确保您始终在基本站点上并且不会从其他域中提取链接,但基本思想是相同的。
还要注意,这纯粹是顺序的,因此速度很慢。您可能希望使用 asynchronous task queue 并行化请求。
const axios = require("axios");
const cheerio = require("cheerio");
// Browser-like User-Agent: some sites reject requests carrying the default
// axios UA, so we impersonate Firefox on Linux.
const headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
};
/**
 * Breadth-first crawl starting at baseUrl, yielding the links found on each
 * fetched page. Pages are visited at most once, up to maxDepth levels deep.
 *
 * @param {string} baseUrl - Starting URL (depth 0).
 * @param {number} maxDepth - Pages at depth >= maxDepth are not fetched.
 * @yields {string[]} absolute http(s) hrefs discovered on one page.
 */
async function *findLinksRecursively(baseUrl, maxDepth) {
  // BUG FIX: `new Set(baseUrl)` seeded the set with the *characters* of the
  // URL (strings are iterable), not the URL itself. Start empty instead.
  const visited = new Set();
  const queue = [{ url: baseUrl, depth: 0 }];
  while (queue.length) {
    const { url, depth } = queue.shift();
    if (depth >= maxDepth || visited.has(url)) continue;
    visited.add(url);
    // Best-effort fetch: an unreachable page contributes no links. The
    // original's fallback was {data: {}}, which hands a non-string to
    // cheerio.load; an empty string parses to an empty document instead.
    const { data } = await axios.get(url, { headers }).catch(() => ({ data: "" }));
    const $ = cheerio.load(data);
    const links = [...$('a[href]:not(a[href^="#"])').map((i, e) => e.attribs["href"])]
      // Resolve relative hrefs against the page they appeared on (the
      // original's `baseUrl + e` broke on hrefs like "about.html"), and
      // drop non-http(s) schemes (mailto:, tel:) and unparseable values.
      .map(href => {
        try {
          const resolved = new URL(href, url);
          return resolved.protocol.startsWith("http") ? resolved.href : null;
        } catch {
          return null;
        }
      })
      .filter(Boolean);
    queue.push(...links.map(link => ({ url: link, depth: depth + 1 })));
    yield links;
  }
}
// Crawl the site to depth 2, dedupe every discovered link, then report
// the ones that mention a social-media platform.
(async () => {
  const baseUrl = "https://netsoltech.com";
  const socialKeywords = [
    "facebook", "twitter", "instagram", "youtube", "linkedin"
  ];
  const links = new Set();
  // for await...of drains the async generator to completion — equivalent
  // to the manual gen.next()/done loop, just more direct.
  for await (const pageLinks of findLinksRecursively(baseUrl, 2)) {
    for (const link of pageLinks) {
      links.add(link);
    }
  }
  const socialLinks = [...links].filter(link =>
    socialKeywords.some(keyword => link.includes(keyword))
  );
  console.log(socialLinks);
})();
在 Puppeteer 中概念几乎相同,只是用 page.goto
代替 Axios 发起请求,用 page.$$eval(或 page.evaluate)
代替 Cheerio 来提取元素。