此问题与node.js中的网址抓取工具有关。
namespace m4
{
    using System;

    /// <summary>
    /// Console rock-paper-scissors. Player 1 is played by the computer (a random
    /// pick each round); Player 2 types "rock", "paper" or "scissors".
    /// Rounds repeat for as long as the user answers "yes" to the replay prompt.
    /// </summary>
    class Program
    {
        /// <summary>Program entry point: runs the interactive game loop.</summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // A single generator is shared by all rounds instead of being
            // re-created on every iteration.
            Random r = new Random();

            do
            {
                // 1 = rock, 2 = paper, 3 = scissors. The upper bound of
                // Next(min, max) is exclusive, so this yields exactly 1..3.
                // Player 1's pick is not announced here so that Player 2
                // cannot see it before choosing; the result lines report it.
                int Player1Choice = r.Next(1, 4);

                Console.WriteLine("Player 2, do you choose rock,paper or scissors");
                string Player2Choice = Console.ReadLine();

                if (Player1Choice == 1)
                {
                    if (Player2Choice == "rock")
                    {
                        Console.WriteLine("Both players chose rock");
                        Console.WriteLine("It is a tie ");
                    }
                    else if (Player2Choice == "paper")
                    {
                        Console.WriteLine("Player 1 chose rock, Player 2 chose paper");
                        Console.WriteLine("Player 2 wins!");
                    }
                    else if (Player2Choice == "scissors")
                    {
                        Console.WriteLine("Player 1 chose rock, Player 2 chose scissors");
                        Console.WriteLine("Player 1 wins! ");
                    }
                    else
                    {
                        Console.WriteLine("You must choose rock,paper or scissors!");
                    }
                }
                else if (Player1Choice == 2)
                {
                    if (Player2Choice == "rock")
                    {
                        Console.WriteLine("Player 1 chose paper, Player 2 chose rock");
                        Console.WriteLine("Player 1 wins!");
                    }
                    else if (Player2Choice == "paper")
                    {
                        Console.WriteLine("Both players chose paper");
                        Console.WriteLine("It is a tie! ");
                    }
                    else if (Player2Choice == "scissors")
                    {
                        Console.WriteLine("Player 1 chose paper, Player 2 chose scissors");
                        Console.WriteLine("Player 2 wins!");
                    }
                    else
                    {
                        Console.WriteLine("You must choose rock,paper or scissors!");
                    }
                }
                else if (Player1Choice == 3)
                {
                    if (Player2Choice == "rock")
                    {
                        Console.WriteLine("Player 1 chose scissors, Player 2 chose rock");
                        Console.WriteLine("Player 2 wins!");
                    }
                    else if (Player2Choice == "paper")
                    {
                        Console.WriteLine("Player 1 chose scissors, Player 2 chose paper");
                        Console.WriteLine("Player 1 wins!");
                    }
                    else if (Player2Choice == "scissors")
                    {
                        Console.WriteLine("Both players chose scissors");
                        Console.WriteLine("It is a tie!");
                    }
                    else
                    {
                        Console.WriteLine("You must choose rock,paper or scissors!");
                    }
                }

                // Asked after every round, whatever Player 1 picked; the loop
                // condition below reads the answer.
                Console.WriteLine("Would you like to play again?");
            } while (Console.ReadLine() == "yes");
        }
    }
}
该工具从 start_url 中的 URL 出发,查找页面中的链接,并将它们“推送”到一个 .json 文件(output.json)中。
如何确保它不会把同一个域名两次“推送”或“写入”到 output.json(以免出现重复)?我之前尝试过使用哈希函数,但这带来了一些问题。
答案 0(得分:2)
您可以用一个 Set 对象来记录之前已经见过的域名,如下所示:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
// Shared crawler state, read and updated by the request callback.

// Domains that have already been queued and written to the output.
const domainList = new Set();

// Work queue of URLs still to be fetched, seeded with the crawl's entry point.
const start_url = ["http://blog.codinghorror.com/"];

// Destination stream that receives each newly discovered domain.
const wstream = fs.createWriteStream("output.json");
/**
 * Extract the lower-cased host name from a URL-like string.
 *
 * Accepts absolute URLs ("http://Host:8080/path"), protocol-relative URLs
 * ("//host/path") and scheme-less strings ("host/path"). The scheme, any
 * "user:password@" prefix, the port, the path, the query string and the
 * fragment are all discarded.
 *
 * @param {string} url - URL or URL-like string to inspect.
 * @returns {string} The host in lower case, or "" when the string has no
 *   host part (for example "/about" or "#top").
 */
function extractDomain(url) {
    const text = url.trim();

    // Only a scheme at the very start counts. Searching for "://" anywhere
    // would be fooled by a URL embedded in a query string, e.g.
    // "example.com/redirect?u=http://other.com".
    const prefix = /^(?:[a-z][a-z0-9+.-]*:)?\/\//i.exec(text);
    const rest = prefix ? text.slice(prefix[0].length) : text;

    // The authority ends at the first path, query or fragment delimiter.
    let authority = rest.split(/[/?#]/, 1)[0];

    // Credentials are only meaningful after "//"; drop "user:password@".
    if (prefix) {
        authority = authority.slice(authority.lastIndexOf('@') + 1);
    }

    // Drop the port number.
    const host = authority.split(':')[0];

    // Domains are not case sensitive, so canonicalize to lower case.
    return host.toLowerCase();
}
/**
 * Fetch `url`, record every not-yet-seen domain linked from that page, then
 * continue with the next URL in the `start_url` queue.
 *
 * For each new domain, "http://<domain>" is appended to the queue and a
 * `"http://<domain>",` entry is written to `wstream`. When the queue is
 * exhausted the stream is closed.
 *
 * NOTE(review): the entries written are comma-terminated quoted strings with
 * no surrounding brackets, so output.json is not valid JSON on its own. The
 * format is kept as-is here; confirm what consumers of the file expect.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 */
var req = function(url){
    request(url, function(error, response, html){
        if (error) {
            // A failed fetch must not stop the crawl, but should be visible.
            console.error('Request failed for ' + url + ':', error);
        } else {
            var $ = cheerio.load(html);
            $("a").each(function(index, element) {
                var link = $(element).attr("href");
                if (!link) {
                    return;
                }

                // Resolve the href against the current page so that relative
                // links ("/about", "#top") map to the page's own host instead
                // of being mistaken for a host name.
                var target;
                try {
                    target = new URL(link, url);
                } catch (err) {
                    return; // malformed href; nothing to record
                }

                // Skip mailto:, javascript:, tel: and other non-web schemes.
                if (target.protocol !== 'http:' && target.protocol !== 'https:') {
                    return;
                }

                var makelinkplain = extractDomain(target.href);

                // Ignore empty results and domains that were already handled.
                if (!makelinkplain || domainList.has(makelinkplain)) {
                    return;
                }

                domainList.add(makelinkplain);
                start_url.push("http://" + makelinkplain);
                wstream.write('"http://'+ makelinkplain + '",');
            });
        }

        // The current URL is done; move on to the next queued one, if any.
        start_url.shift();
        if(start_url.length > 0) {
            return req(start_url[0]);
        }

        wstream.end();
    });
}
// Start the crawl with the first URL in the queue.
req(start_url[0]);
注意:我还在 extractDomain() 函数中添加了 .toLowerCase(),因为域名不区分大小写,而 Set 对象是区分大小写的。这样可以确保仅大小写不同的域名也会被识别为同一个域名。