1. 抓取数据
const request=require('request-promise');
const cheerio=require('cheerio');
const debug=require('debug')('juejin:task:read');
1.1 获取标签列表 [#](#t11.1 获取标签列表)
exports.tagList=async function (uri) {
debug('读取文章标签列表');
let options={
uri,
transform: function (body) {
return cheerio.load(body);
}
}
return request(options).then($ => {
let tags= [];
$('.item').each((i,item) => {
let tag=$(item);
let image=tag.find('div.thumb').first();
let title=tag.find('.title').first();
let subscribe=tag.find('.subscribe').first();
let article=tag.find('.article').first();
let name=title.text().trim();
tags.push({
image: image.data('src').trim(),
name,
url:`https://juejin.im/tag/${encodeURIComponent(title.text().trim())}`,
subscribe: Number(subscribe.text().match(/(\d+)/)[1]),
article:Number(article.text().match(/(\d+)/)[1])
});
debug(`读取文章标签:${name}`);
});
return tags.slice(0,1);
});
}
#
1.2.文章列表exports.articleList=async function (uri) {
debug('读取博文列表');
let options={
uri,
transform: function (body) {
return cheerio.load(body);
}
}
return request(options).then(async $ => {
let articleList=[];
let items =$('.item .title');
for (let i=0;i<items.length;i++) {
let article=$(items[i]);
let href = article.attr('href').trim();
let title=article.text().trim();
let id=href.match(/\/(\w+)$/)[1];
href='https://juejin.im'+href;
let articleDetail = await readArticle(id,href);
articleList.push({
href,
title,
id,
content:articleDetail.content,
tags:articleDetail.tags
});
debug(`读取文章列表:${title}`);
}
return articleList;
});
}
#
1.3.文章详情
/**
 * Fetch one article page and extract its title, HTML content and tag names.
 *
 * Tag names are taken from the last path segment of each tag link and
 * URL-decoded, so they match the plain names stored for the tags (tag URLs are
 * built as `/tag/` + `encodeURIComponent(name)`). Links without an `href` are
 * dropped.
 *
 * @param {string} id - Article id; copied into the result unchanged.
 * @param {string} uri - URL of the article page.
 * @returns {Promise<{id: string, title: string, content: string, tags: string[]}>}
 */
async function readArticle(id, uri) {
    debug('读取博文');
    const options = {
        uri,
        // Parse the response body with cheerio so the request resolves with `$`.
        transform: function (body) {
            return cheerio.load(body);
        }
    };
    const $ = await request(options);
    const article = $('.main-container');
    const title = article.find('h1').text().trim();
    const content = article.find('.article-content').html();
    const tags = article
        .find('.tag-list-box>div.tag-list>a.item')
        .map((index, item) => {
            const href = $(item).attr('href');
            if (!href) {
                // Returning null makes cheerio's map() leave this entry out.
                return null;
            }
            const trimmed = href.replace(/\/+$/, '');
            const segment = trimmed.slice(trimmed.lastIndexOf('/') + 1);
            if (!segment) {
                return null;
            }
            try {
                return decodeURIComponent(segment);
            } catch (err) {
                // Malformed percent-encoding: keep the raw segment.
                return segment;
            }
        })
        .get();
    debug(`读取文章详情:${title}`);
    return {
        id,
        title,
        content,
        tags
    };
}
2 表结构 [#](#t42 表结构)
2.1 tags(标签表) [#](#t52.1 tag(标签表))
字段 | 类型 | 说明 |
---|---|---|
id | int(11) | 标签ID |
name | varchar(255) | 标签名称 |
image | varchar(255) | 标签图片 |
url | varchar(255) | url地址 |
subscribe | int(11) | 订阅数 |
article | int(11) | 文章数 |
+-----------+--------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+-----------+--------------+------+-----+---------+----------------+
| id | int(11) | NO | PRI | NULL | auto_increment |
| name | varchar(255) | NO | | NULL | |
| image | varchar(255) | NO | | NULL | |
| url | varchar(255) | NO | | NULL | |
| subscribe | int(11) | YES | | NULL | |
| article | int(11) | YES | | NULL | |
+-----------+--------------+------+-----+---------+----------------+
2.2 articles(文章表) [#](#t62.2 articles(文章表))
字段 | 类型 | 说明 |
---|---|---|
id | varchar(255) | 文章ID |
title | varchar(255) | 文章名称 |
href | varchar(255) | 文章链接 |
content | longtext | 文章内容 |
+---------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+---------+--------------+------+-----+---------+-------+
| id | varchar(255) | NO | PRI | NULL | |
| title | varchar(255) | NO | | NULL | |
| content | longtext | YES | | NULL | |
| href | varchar(255) | YES | | NULL | |
+---------+--------------+------+-----+---------+-------+
2.3 article_tag(文章标签表) [#](#t72.3 article_tag(文章标签表))
字段 | 类型 | 说明 |
---|---|---|
article_id | varchar(255) | 文章ID |
tag_id | int(11) | 标签ID |
+------------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+------------+--------------+------+-----+---------+-------+
| article_id | varchar(255) | NO | PRI | NULL | |
| tag_id | int(11) | NO | PRI | NULL | |
+------------+--------------+------+-----+---------+-------+
3. 写入数据库 [#](#t83. 写入数据库)
db.js
const mysql=require('mysql');
var Promise = require('bluebird');
const connection = mysql.createConnection({
host: '127.0.0.1', // 数据库地址
port: 3306, // 数据库端口
database: 'juejin', // 数据库名称
user: 'root', // 数据库用户
password: '' // 数据库用户对应的密码
});
connection.connect();
module.exports={
query:Promise.promisify(connection.query).bind(connection),
end:connection.end
}
crawl/task/write.js
const {query,end}=require('../db');
const debug=require('debug')('juejin:task:write');
3.1 写入标签 [#](#t93.1 写入标签)
exports.tagList=async function (tagList) {
debug('保存文章标签列表');
for (tag of tagList) {
let oldTags=await query(`SELECT 1 FROM tags WHERE name=? LIMIT 1 `,[tag.name]);
if (Array.isArray(oldTags)&&oldTags.length>0) {
let oldTag=oldTags[0];
await query(`UPDATE tags SET name=?,image=?,url=? WHERE id=?`,[tag.name,tag.image,tag.url,oldTag.id]);
} else {
await query(`INSERT INTO tags(name,image,url) VALUES(?,?,?)`,[tag.name,tag.image,tag.url]);
}
}
}
3.2 写入文章 [#](#t103.2 写入文章)
exports.articleList=async function (articleList) {
debug('写入博文列表');
debugger;
for (article of articleList) {
let oldArticles = await query(`SELECT 1 FROM articles WHERE id=? LIMIT 1 `,article.id);
if (Array.isArray(oldArticles)&&oldArticles.length>0) {
let oldArticle=oldArticles[0];
await query(`UPDATE articles SET title=?,content=?,href=? WHERE id=?`,[article.title,article.content,article.href,oldArticle.id]);
} else {
await query(`INSERT INTO articles(id,title,href,content) VALUES(?,?,?,?)`,[article.id,article.title,article.href,article.content]);
}
await query(`DELETE FROM article_tag WHERE article_id=? `,[article.id]);
const where="('"+article.tags.join("','")+"')";
const sql=`SELECT id FROM tags WHERE name IN ${where}`;
let tagIds = await query(sql);
for (row of tagIds) {
await query(`INSERT INTO article_tag(article_id,tag_id) VALUES(?,?)`,[article.id,row.id]);
}
}
}
4. 建立web服务器查看数据 [#](#t114. 建立web服务器查看数据)
let express=require('express');
const path=require('path');
const {query}=require('../db');
const cronJob=require('cron').CronJob;
const debug=require('debug')('crawl:server');
const {spawn}=require('child_process');
// Express application that renders `.html` templates through EJS.
const app = express();
// Register EJS as the renderer for files with the `html` extension.
app.engine('html', require('ejs').__express);
// Templates are looked up in `views`, resolved against the current working directory.
app.set('views', path.resolve('views'));
app.set('view engine', 'html');
/**
 * GET / -- list all tags plus the articles of the selected tag.
 *
 * The tag is chosen with the `tagId` query parameter and defaults to the first
 * tag. With no tags in the database the page renders with an empty article
 * list. Query failures are forwarded to Express's error handling.
 */
app.get('/', async function (req, res, next) {
    try {
        const tags = await query(`SELECT * FROM tags`);
        // Only accept a plain string: the query parser can also yield arrays or
        // objects, which must not reach the SQL placeholder.
        let tagId = typeof req.query.tagId === 'string' ? req.query.tagId : '';
        if (!tagId && tags.length > 0) {
            tagId = tags[0].id;
        }
        const articles = tagId
            ? await query(`SELECT a.* from articles a inner join article_tag t on a.id = t.article_id WHERE t.tag_id =? `, [tagId])
            : [];
        res.render('index', {
            tags, articles
        });
    } catch (err) {
        next(err);
    }
});
/**
 * GET /detail/:id -- render a single article.
 *
 * Responds with 404 when no article has the given id. Query failures are
 * forwarded to Express's error handling.
 */
app.get('/detail/:id', async function (req, res, next) {
    try {
        const id = req.params.id;
        const articles = await query(`SELECT * FROM articles WHERE id=? `, [id]);
        if (!Array.isArray(articles) || articles.length === 0) {
            res.status(404).send('Article not found');
            return;
        }
        res.render('detail', { article: articles[0] });
    } catch (err) {
        next(err);
    }
});
// Start the HTTP server on port 8080.
app.listen(8080);
// Scheduled task ('*/5 * * * *': every five minutes) that runs update/index.js
// in a child Node process and forwards its output to this process.
const job = new CronJob('*/5 * * * *', function () {
    debug('开始执行定时任务');
    const update = spawn(process.execPath, [path.resolve(__dirname, 'update/index.js')]);
    update.stdout.pipe(process.stdout);
    update.stderr.pipe(process.stderr);
    // Without an 'error' listener a failed spawn surfaces as an uncaught exception.
    update.on('error', function (err) {
        console.error('更新任务启动失败: %s', err && err.stack ? err.stack : err);
    });
    update.on('close', function (code) {
        console.log('更新任务,代码=%d', code);
    });
});
job.start();
// Last-resort handler: log uncaught exceptions instead of letting them
// terminate the process. Falls back to the thrown value itself when it has no
// stack (for example when a non-Error was thrown).
process.on('uncaughtException', function (err) {
    const detail = err && err.stack ? err.stack : err;
    console.error('uncaughtException: %s', detail);
});
<%# Index page: tag list on the left, articles of the selected tag on the right. %>
<%- include('header.html') %>
<div class="container">
    <div class="row">
        <div class="col-md-2">
            <ul class="list-group">
                <% tags.forEach(tag => { %>
                <li class="list-group-item text-center">
                    <a href="/?tagId=<%= tag.id %>">
                        <img style="width:25px;height:25px;" src="<%= tag.image %>" alt="<%= tag.name %>"/>
                        <%= tag.name %>
                    </a>
                </li>
                <% }) %>
            </ul>
        </div>
        <div class="col-md-10">
            <ul class="list-group">
                <% articles.forEach(article => { %>
                <li class="list-group-item">
                    <a href="/detail/<%= article.id %>">
                        <%= article.title %>
                    </a>
                </li>
                <% }) %>
            </ul>
        </div>
    </div>
</div>
<%- include('footer.html') %>
<%# Detail page: renders one article passed in as `article`. %>
<%- include('header.html') %>
<div class="container">
    <div class="row">
        <div class="col-md-12">
            <div class="panel">
                <div class="panel-heading">
                    <%# The title is plain text, so it is HTML-escaped. %>
                    <h1 class="text-center"><%= article.title %></h1>
                </div>
                <div class="panel-body">
                    <%# NOTE(review): the content is scraped HTML emitted unescaped on purpose; it is untrusted, so consider sanitizing it before storing or rendering. %>
                    <%- article.content %>
                </div>
            </div>
        </div>
    </div>
</div>
<%- include('footer.html') %>
<!DOCTYPE html>
<%# Shared page chrome: Bootstrap 3 stylesheet and the top navigation bar. %>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <link rel="stylesheet" href="https://cdn.bootcss.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
    <title>博客列表</title>
</head>
<body>
    <nav class="navbar navbar-default">
        <div class="container-fluid">
            <!-- Brand and toggle get grouped for better mobile display -->
            <div class="navbar-header">
                <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1" aria-expanded="false">
                    <span class="sr-only">Toggle navigation</span>
                    <span class="icon-bar"></span>
                    <span class="icon-bar"></span>
                    <span class="icon-bar"></span>
                </button>
                <a class="navbar-brand" href="#">博客列表</a>
            </div>
            <!-- Collect the nav links, forms, and other content for toggling -->
            <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
                <ul class="nav navbar-nav">
                    <li><a href="/">首页</a></li>
                </ul>
            </div><!-- /.navbar-collapse -->
        </div><!-- /.container-fluid -->
    </nav>
</body>
</html>
Gitalking ...
Be the first guy leaving a comment!