如何使用包含属性的特定标签从网站抓取?

时间:2018-12-10 16:52:52

标签: javascript node.js mongodb express

我遇到麻烦的代码区域位于get(“ /”)路由中。我正试图从NYT网站上抓取。我正在尝试记录标题和链接。当我这样做时,我得到了两次记录的文章和一堆未定义的文章。我正在尝试包含if语句,该语句仅使用属性刮取H2标签。

var express = require("express");
var exphbs = require("express-handlebars");
var cheerio = require("cheerio");
var axios = require("axios");
var mongoose = require("mongoose");

var app = express();
var PORT = process.env.PORT || 3000;

var db = require("./models/Article");
//ignore before publishing
mongoose.connect(
"mongodb://localhost/scrapper",
{ useNewUrlParser: true }
);

//works with express for template engine"

app.engine("handlebars", exphbs({ defaultLayout: "main" }));
app.set("view engine", "handlebars");
app.use(express.static("public"));
 app.use(express.urlencoded({ extended: true }));
app.use(express.json());
app.set("port", process.env.PORT || 3000);

  app.get("/", function(req, res) {
   axios
  .get("https://www.nytimes.com/section/technology")
  .then(function(response) {
  var $ = cheerio.load(response.data);
  var result = {};
  var title = $(element)
    .find("h2.headline")
    .text()
    .trim();
  var link = $(element)
    .find("a")
    .attr("href");
  if (title && link) {
    result.title = $(element)
      .find("h2.headline")
      .text()
      .trim();
    result.link = $(element)
      .find("a")
      .attr("href");
    console.log(result.title);
    console.log("https://www.nytimes.com/section/technology" + 
    result.link);
    }
   });
   res.render("index");
  });

  app.use(function(req, res) {
 res.type("text/plain");
  res.status(404);
  res.send("404 - not found");
  });

  var MONGODB_URI =
  process.env.MONGODB_URI || "mongodb://localhost/mongoHeadlines";
  mongoose.connect(MONGODB_URI);

 app.listen(PORT, function() {
 console.log("Express Server Started on http://localhost:3000");
  });

0 个答案:

没有答案
相关问题