使用MongoDB为新闻简报创建数据源
有很多可供选择的新闻发布网站,比如ndtv.com。在这篇文章中,让我们看看如何通过提取功能(scraping feature)从ndtv.com获取数据,即从ndtv.com提取内容并将其存储到MongoDB中。MongoDB是一种NoSQL文档模型数据库。
使用Mongoose、Node.js和Cheerio,对NDTV新闻网站进行抓取,并将数据加载到MongoDB数据库中。这是一个使用MongoDB、Mongoose、Node.js、Express.js、Handlebars.js、HTML和CSS构建的全栈JavaScript应用程序。它抓取NDTV的首页并存储文章标题和链接。
模块安装: 使用以下命令安装所需的模块。
npm install body-parser
npm install cheerio
npm install express
npm install express-handlebars
npm install mongoose
npm install request
项目结构: 它看起来是这样的。
实现:
文件名:server.js: 这是启动应用程序的重要文件。用于调用ndtv网站,解析数据,并将其存储在MongoDB数据库中。
JavaScript
// First specifying the required dependencies
// Express is a minimal and flexible Node.js
// web application framework that provides a
// robust set of features for web and mobile
// applications
const express = require("express");
// To communicate with mongodb, we require "mongoose"
const mongoose = require("mongoose");
// As we need to call ndtv website and access
// the urls, we require "request"
const request = require("request");
// Cheerio parses markup and provides an
// API for traversing/manipulating the
// resulting data structure
const cheerio = require("cheerio");
// Node.js body parsing middleware.
// Parse incoming request bodies in a
// middleware before your handlers,
// available under the req.body property.
const bodyParser = require("body-parser");
const exphbs = require("express-handlebars");
// Application bootstrap: port selection, Express middleware, MongoDB
// connection, Handlebars view engine and model loading. The statements
// run top to bottom when the file is loaded.
// Take the port from the environment when it is set,
// otherwise fall back to 3000
const PORT = process.env.PORT || 3000;
// Initialize Express
const app = express();
// Parse URL-encoded form submissions into req.body
app.use(bodyParser.urlencoded({
extended: false
}));
// Parse request bodies sent as application/json
// into req.body
app.use(bodyParser.json({
type: "application/json"
}));
// Serve static files from the "public" directory
app.use(express.static("public"));
// Tell Mongoose to use the native Promise
// implementation, so async operations like
// .save() and queries return thenables.
// The connection string comes from the
// MONGODB_URI environment variable; without it
// we use a local database named "ndtvnews".
mongoose.Promise = Promise;
const MONGODB_URI = process.env.MONGODB_URI
|| "mongodb://localhost/ndtvnews";
mongoose.connect(MONGODB_URI);
// Register Handlebars as the view engine, with
// "main" as the default layout
app.engine("handlebars", exphbs({
defaultLayout: "main"
}));
app.set("view engine", "handlebars");
// Load the models from ./models; the routes below
// use them as db.Article and db.Note
const db = require("./models");
// Home page route.
// Looks up every article whose "saved" flag is false and renders
// them with the "index" view (index.handlebars) as `articles`.
// On a database error the error is logged and the client
// receives a 500 instead of being left without a response.
app.get("/", function (req, res) {
  db.Article.find({ saved: false })
    .then(function (dbArticle) {
      // Pass the unsaved articles to index.handlebars
      res.render("index", {
        articles: dbArticle
      });
    })
    .catch(function (error) {
      // Every request must be answered; only logging here
      // would leave the browser waiting until it times out.
      console.log(error);
      res.sendStatus(500);
    });
});
// Scrape route: fetch the NDTV front page with `request`, pull the
// headline title and link out of every <h2> with cheerio, and store
// each pair as an Article document.
// Meant to be triggered roughly once a day.
// Responds 200 once all inserts have settled, or 500 when the page
// could not be fetched.
app.get("/scrape", function (req, res) {
  request("https://ndtv.com/", function (error, response, html) {
    if (error) {
      // Without a page there is nothing to parse
      console.log(error);
      return res.sendStatus(500);
    }
    // Load the html body from request into cheerio
    const $ = cheerio.load(html);
    // One promise per insert, so the response can wait for all of them
    const pendingInserts = [];
    // The headlines sit in <h2> elements that wrap an <a>.
    // This selector is specific to the scraped page and has to be
    // re-checked (inspect / view source) if the page layout changes.
    $("h2").each(function (_index, element) {
      const anchor = $(element).find("a");
      // The trim() removes whitespace because the
      // items return \n and \t before and after the text
      const title = anchor.text().trim();
      console.log("title", title);
      const link = anchor.attr("href");
      console.log("link", link);
      // Only store entries that have both a title and a link
      if (title && link) {
        pendingInserts.push(
          db.Article.create({
            title: title,
            link: link
          })
            .then(function (inserted) {
              // Log the inserted data
              console.log(inserted);
            })
            .catch(function (err) {
              // A single failed insert is logged and must not
              // abort the rest of the scrape
              console.log(err);
            })
        );
      }
    });
    // Answer exactly once, regardless of how many headlines were found.
    // Each insert promise handles its own error, so this never rejects.
    Promise.all(pendingInserts).then(function () {
      res.sendStatus(200);
    });
  });
});
// Saved-articles page.
// An article counts as saved once its "saved" field is true
// (the user sets it through the save action). This route
// looks up exactly those articles and renders them with the
// "saved" view (saved.handlebars).
app.get("/saved", (req, res) => {
  const onlySaved = { saved: true };

  // Success: hand the matching articles to saved.handlebars
  const renderSavedPage = (savedArticles) => {
    res.render("saved", { articles: savedArticles });
  };

  // Failure: send the error back to the client as JSON
  const reportFailure = (failure) => {
    res.json(failure);
  };

  db.Article.find(onlySaved)
    .then(renderSavedPage)
    .catch(reportFailure);
});
// Route for marking an article as saved.
// The article is identified by its unique _id, taken from the URL.
// The fields in the request body are written onto the article with
// the $set update operator, and the updated document is rendered
// with the "saved" view (saved.handlebars).
// On failure the error is sent back to the client as JSON.
app.put("/saved/:id", function (req, res) {
  db.Article.findByIdAndUpdate(
    req.params.id,
    {
      // NOTE(review): the whole request body is applied as-is;
      // consider restricting it to the "saved" field.
      $set: req.body
    },
    {
      // Resolve with the document as it is after the update
      new: true
    }
  )
    .then(function (dbArticle) {
      res.render("saved", {
        articles: dbArticle
      });
    })
    .catch(function (err) {
      res.json(err);
    });
});
// Route for saving a new note to the db and
// associating it with an article.
// Creates a Note from the request body, appends its _id to the
// "notes" array of the article named in the URL ($push), and
// responds with the created note as JSON.
// On failure the error is sent back to the client as JSON.
app.post("/submit/:id", function (req, res) {
  db.Note.create(req.body)
    .then(function (dbNote) {
      // The id string from the URL is passed straight through;
      // Mongoose casts it to an ObjectId for the _id lookup.
      return db.Article.findByIdAndUpdate(
        req.params.id,
        {
          $push: {
            notes: dbNote._id
          }
        }
      ).then(function () {
        // Nested on purpose: dbNote is only in scope inside
        // this callback, and it is what the client receives.
        res.json(dbNote);
      });
    })
    .catch(function (err) {
      // If an error occurs, send it
      // back to the client
      res.json(err);
    });
});
// Route to fetch one article by its _id together with its notes.
// The "notes" references are populated, so the JSON response
// contains the full note documents.
// On failure the error is logged and returned with status 500.
app.get("/notes/article/:id", function (req, res) {
  db.Article.findOne({ _id: req.params.id })
    .populate("notes")
    .exec()
    .then(function (data) {
      res.json(data);
    })
    .catch(function (error) {
      // Answer the request even on failure so the client
      // is not left waiting
      console.log(error);
      res.status(500).json(error);
    });
});
// Route to delete the note with the given _id.
// Responds with the removed note document as JSON; on failure the
// error is logged and returned with status 500.
// NOTE(review): this deletes on a GET request. The method is kept
// because callers outside this file may depend on it; confirm
// whether it can become DELETE.
app.get("/notes/:id", function (req, res) {
  db.Note.findOneAndDelete({ _id: req.params.id })
    .then(function (data) {
      res.json(data);
    })
    .catch(function (error) {
      console.log(error);
      res.status(500).json(error);
    });
});
// Start the HTTP server on PORT and log a line once it is up
app.listen(PORT, () => {
  console.log("App is running");
});
运行应用程序的步骤: 使用以下命令运行 server.js 文件。
node server.js
输出: 我们将在终端屏幕上看到以下输出。
App is running
现在打开任意浏览器并转到 http://localhost:3000/ ,我们将会得到一个类似如下的页面。
要从ndtv.com获取新闻,我们需要点击 获取新文章 。这将在内部调用我们的/scrape路径。一旦完成此调用,在MongoDB中,ndtvnews数据库下的articles集合将填充以下数据:
在这里,saved属性的初始值为false;_id由MongoDB自动生成,是集合中每个文档的唯一标识。查看文档、保存文档等操作都依靠这个_id属性来定位文档。
点击 在NDTV上查看文章 后,将会导航到相应的文章。这之所以可行,是因为articles集合中的每个文档都有_id属性。因此,当我们点击 在NDTV上查看文章 时,由于它是一个超链接,应用会在内部获取该文档的_id值,并显示对应的链接。当点击 保存文章 时,_id值同样用于标识该文章。
结论: 我们可以轻松简洁地抓取任意新闻网站,只显示标题并附上原文链接;还可以方便地保存文章并查看已保存的文章。
参考链接: https://github.com/raj123raj/NdtvNewsScraperUsingMongoDB