2018-09-29

1. 抓取数据


const request=require('request-promise');
const cheerio=require('cheerio');
const debug=require('debug')('juejin:task:read');

1.1 获取标签列表

exports.tagList=async function (uri) {
    debug('读取文章标签列表');
    let options={
        uri,
        transform: function (body) {
            return cheerio.load(body);
        }
    }
    return request(options).then($ => {
            let tags= [];
            $('.item').each((i,item) => {
                let tag=$(item);
                let image=tag.find('div.thumb').first();
                let title=tag.find('.title').first();
                let subscribe=tag.find('.subscribe').first();
                let article=tag.find('.article').first();
                let name=title.text().trim();
                tags.push({
                    image: image.data('src').trim(),
                    name,
                    url:`https://juejin.im/tag/${encodeURIComponent(title.text().trim())}`,
                    subscribe: Number(subscribe.text().match(/(\d+)/)[1]),
                    article:Number(article.text().match(/(\d+)/)[1])
                });
                debug(`读取文章标签:${name}`);
            });
           return tags.slice(0,1);
    });
}

1.2 文章列表

exports.articleList=async function (uri) {
    debug('读取博文列表');
    let options={
        uri,
        transform: function (body) {
            return cheerio.load(body);
        }
    }
    return request(options).then(async $ => {
        let articleList=[];
        let items =$('.item .title');
        for (let i=0;i<items.length;i++) {
            let article=$(items[i]);
            let href = article.attr('href').trim();
            let title=article.text().trim();
            let id=href.match(/\/(\w+)$/)[1];
            href='https://juejin.im'+href;
            let articleDetail = await readArticle(id,href);
            articleList.push({
                href,
                title,
                id,
                content:articleDetail.content,
                tags:articleDetail.tags
            });
            debug(`读取文章列表:${title}`);
        }
        return articleList;
    });
}

1.3 文章详情

/**
 * Download a single article page and extract its title, HTML body and tags.
 *
 * @param {string} id - Article id; passed through unchanged into the result.
 * @param {string} uri - URL of the article page.
 * @returns {Promise<{id: string, title: string, content: string, tags: string[]}>}
 */
async function readArticle(id,uri) {
    debug('读取博文');
    const $=await request({
        uri,
        transform: body => cheerio.load(body)
    });

    const container=$('.main-container');
    const title=container.find('h1').text().trim();
    const content=container.find('.article-content').html();

    const tagLinks=container.find('.tag-list-box>div.tag-list>a.item');
    const mapped=tagLinks.map((index,link) => {
        const href=$(link).attr('href');
        if (!href) {
            return href;
        }
        // NOTE(review): drops the first four characters of the href,
        // presumably a '/tag' prefix - confirm the remainder is what the
        // tag lookup by name expects.
        return href.slice(4);
    });
    // Turn the array-like cheerio result into a plain array.
    const tags=Array.from(mapped);

    debug(`读取文章详情:${title}`);
    return {id,title,content,tags};
}

2. 表结构

2.1 tags(标签表)

字段 类型 说明
id int(11) 标签ID
name varchar(255) 标签名称
image varchar(255) 标签图片
url varchar(255) url地址
subscribe int(11) 订阅数
article int(11) 文章数
+-----------+--------------+------+-----+---------+----------------+
| Field     | Type         | Null | Key | Default | Extra          |
+-----------+--------------+------+-----+---------+----------------+
| id        | int(11)      | NO   | PRI | NULL    | auto_increment |
| name      | varchar(255) | NO   |     | NULL    |                |
| image     | varchar(255) | NO   |     | NULL    |                |
| url       | varchar(255) | NO   |     | NULL    |                |
| subscribe | int(11)      | YES  |     | NULL    |                |
| article   | int(11)      | YES  |     | NULL    |                |
+-----------+--------------+------+-----+---------+----------------+

2.2 articles(文章表)

字段 类型 说明
id varchar(255) 文章ID
title varchar(255) 文章名称
href varchar(255) 文章链接
content longtext 文章内容
+---------+--------------+------+-----+---------+-------+
| Field   | Type         | Null | Key | Default | Extra |
+---------+--------------+------+-----+---------+-------+
| id      | varchar(255) | NO   | PRI | NULL    |       |
| title   | varchar(255) | NO   |     | NULL    |       |
| content | longtext     | YES  |     | NULL    |       |
| href    | varchar(255) | YES  |     | NULL    |       |
+---------+--------------+------+-----+---------+-------+

2.3 article_tag(文章标签表)

字段 类型 说明
article_id varchar(255) 文章ID
tag_id int(11) 标签ID
+------------+--------------+------+-----+---------+-------+
| Field      | Type         | Null | Key | Default | Extra |
+------------+--------------+------+-----+---------+-------+
| article_id | varchar(255) | NO   | PRI | NULL    |       |
| tag_id     | int(11)      | NO   | PRI | NULL    |       |
+------------+--------------+------+-----+---------+-------+

3. 写入数据库

db.js

const mysql=require('mysql');
var Promise = require('bluebird');
const connection = mysql.createConnection({
    host:            '127.0.0.1',   // 数据库地址
    port:            3306,          // 数据库端口
    database:        'juejin',   // 数据库名称
    user:            'root',        // 数据库用户
    password:        ''             // 数据库用户对应的密码
});
connection.connect();
module.exports={
    query:Promise.promisify(connection.query).bind(connection),
    end:connection.end
}

crawl/task/write.js

const {query,end}=require('../db');
const debug=require('debug')('juejin:task:write');

3.1 写入标签

exports.tagList=async function (tagList) {
    debug('保存文章标签列表');
    for (tag of tagList) {
        let oldTags=await query(`SELECT 1 FROM tags WHERE name=? LIMIT 1 `,[tag.name]);
        if (Array.isArray(oldTags)&&oldTags.length>0) {
            let oldTag=oldTags[0];
            await query(`UPDATE tags SET name=?,image=?,url=? WHERE id=?`,[tag.name,tag.image,tag.url,oldTag.id]);
        } else {
            await query(`INSERT INTO tags(name,image,url) VALUES(?,?,?)`,[tag.name,tag.image,tag.url]);
        }
    }
}

3.2 写入文章

exports.articleList=async function (articleList) {
    debug('写入博文列表');
    debugger;
    for (article of articleList) {
        let oldArticles = await  query(`SELECT 1 FROM articles WHERE id=? LIMIT 1 `,article.id);
        if (Array.isArray(oldArticles)&&oldArticles.length>0) {
            let oldArticle=oldArticles[0];
            await query(`UPDATE articles SET title=?,content=?,href=? WHERE id=?`,[article.title,article.content,article.href,oldArticle.id]);
        } else {
            await query(`INSERT INTO articles(id,title,href,content) VALUES(?,?,?,?)`,[article.id,article.title,article.href,article.content]);
        }
        await query(`DELETE FROM article_tag WHERE article_id=? `,[article.id]);
        const where="('"+article.tags.join("','")+"')";
        const sql=`SELECT id FROM tags WHERE name IN ${where}`;
        let tagIds = await query(sql);
        for (row of tagIds) {
            await query(`INSERT INTO article_tag(article_id,tag_id) VALUES(?,?)`,[article.id,row.id]);
        }
    }
}

4. 建立web服务器查看数据

let express=require('express');
const path=require('path');
const {query}=require('../db');
const cronJob=require('cron').CronJob;
const debug=require('debug')('crawl:server');
const {spawn}=require('child_process');
// Express app that serves the crawled data: '/' lists tags plus the articles
// of one tag, '/detail/:id' shows a single article. Templates are .html files
// rendered through EJS.
let app=express();
app.set('view engine','html');
app.set('views',path.resolve('views'));
app.engine('html',require('ejs').__express);

// Index page. `?tagId=` selects the tag; without it the first tag is used.
app.get('/',async function (req,res,next) {
    try {
        const tags=await query(`SELECT * FROM tags`);
        // Accept only a plain string: the query parser can yield arrays or
        // objects, which must not reach the SQL placeholder.
        let tagId=typeof req.query.tagId==='string'? req.query.tagId:'';
        if (!tagId&&tags.length>0) {
            tagId=tags[0].id;
        }
        // With an empty tags table there is nothing to filter by.
        const articles=tagId
            ? await query(`SELECT a.* from articles a inner join article_tag  t on a.id = t.article_id WHERE t.tag_id =? `,[tagId])
            : [];
        res.render('index',{
            tags,articles
        });
    } catch (err) {
        // Express 4 does not observe rejected promises from handlers.
        next(err);
    }
});

// Detail page for one article; answers 404 when the id is unknown.
app.get('/detail/:id',async function (req,res,next) {
    try {
        const id=req.params.id;
        const articles=await query(`SELECT * FROM articles WHERE id=? `,[id]);
        if (articles.length===0) {
            res.status(404).send('Article not found');
            return;
        }
        res.render('detail',{article:articles[0]});
    } catch (err) {
        next(err);
    }
});
app.listen(8080);
// Every five minutes, run update/index.js in a child Node process and
// forward its output to this process.
let job=new CronJob('*/5 * * * *',function () {
    debug('开始执行定时任务');
    let update= spawn(process.execPath,[path.resolve(__dirname,'update/index.js')]);
    update.stdout.pipe(process.stdout);
    update.stderr.pipe(process.stderr);
    // A failed spawn emits 'error'; report it instead of letting it throw.
    update.on('error',function (err) {
        console.error('更新任务启动失败: %s',err&&err.stack? err.stack:err);
    });
    update.on('close',function (code) {
        console.log('更新任务,代码=%d',code);
    });
});
job.start();

// Last-resort handler: log uncaught exceptions instead of letting them
// terminate the process. Falls back to the raw value when the thrown thing
// has no stack.
process.on('uncaughtException',function (err) {
    console.error('uncaughtException: %s',err&&err.stack? err.stack:err);
});
<%# Index view: left column lists every entry of `tags` as a link to /?tagId=ID with its image and name; right column lists every entry of `articles` as a link to /detail/ID. Both locals must be arrays. -%>
<%- include header.html%>
<div class="container">
          <div class="row">
          <div class="col-md-2">
            <ul class="list-group">
               <%tags.forEach(tag=>{%>
                   <li class="list-group-item text-center">
                        <a href="/?tagId=<%=tag.id%>">
                        <img style="width:25px;height:25px;" src="<%=tag.image%>"/>
                        <%=tag.name%>
                    </a>
                  </li>
               <%})%>
            </ul>
          </div>
          <div class="col-md-10">
              <ul class="list-group">
               <%articles.forEach(article=>{%>
                   <li class="list-group-item">
                        <a href="/detail/<%=article.id%>">
                        <%=article.title%>
                    </a>
                  </li>
               <%})%>
            </ul>
          </div>
        </div>
    </div>
<%- include footer.html%>
<%# Detail view: shows `article.title` as the heading and `article.content` as the body. The title is escaped because it is scraped plain text; the content is stored HTML and is emitted raw on purpose. -%>
<%# NOTE(review): the content comes from third-party pages and is not sanitized here - consider sanitizing it before it is stored or rendered. -%>
<%- include header.html%>
    <div class="container">
          <div class="row">
          <div class="col-md-12">
              <div class="panel">
              <div class="panel-heading">
                  <h1 class="text-center"><%= article.title%></h1>
              </div>
              <div class="panel-body">
                  <%- article.content%>
              </div>
            </div>
          </div>
        </div>
    </div>
<%- include footer.html%>
<!DOCTYPE html>
<html lang="en">
<head>
    <!-- Page shell: loads the Bootstrap 3.3.7 stylesheet from a CDN (pinned with an integrity hash) and renders a top navbar with one link to the home page. -->
    <!-- NOTE(review): this fragment also closes body and html. If it is the header include used by the page templates, their content is emitted after the closing tags; confirm how it is combined with the footer. -->
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <link rel="stylesheet" href="https://cdn.bootcss.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
    <title>博客列表</title>
</head>
<body>
<nav class="navbar navbar-default">
  <div class="container-fluid">
    <!-- Brand and toggle get grouped for better mobile display -->
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1" aria-expanded="false">
        <span class="sr-only">Toggle navigation</span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="#">博客列表</a>
    </div>

    <!-- Collect the nav links, forms, and other content for toggling -->
    <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
      <ul class="nav navbar-nav">
        <li><a href="/">首页</a></li>
      </ul>
    </div><!-- /.navbar-collapse -->
  </div><!-- /.container-fluid -->
</nav>
</body>
</html>

Gitalking ...

Markdown is supported

Be the first guy leaving a comment!