那些10w+的公众号都在写什么？

出于好奇，那些10w+的公众号都写了些什么，于是我写了几个脚本爬取了各行业Top的公众号文章，进行了关键词统计。

抓取数据、分析用到了3中语言：Node.js，Java，Python。废话不多说，直接上代码。

1(NODEJS)

puppeteer模拟登陆，抓取微信公众号链接：

/**
* load wechat article urls on newrank.cn
**/
const puppeteer = require('puppeteer');
//emulate iphone
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36';
const workPath = './newrank_cn1111';
const fs = require("fs");
const userName = "公众号";
const ppwwdd = "caiyongji";
if (!fs.existsSync(workPath)) {
        fs.mkdirSync(workPath)
}
const loginUrl = 'https://www.newrank.cn/public/login/login.html?back=https%3A//www.newrank.cn/';

const monthlyRankUrl = "https://www.newrank.cn/public/info/list.html?period=month&type=data";

const detailUrl = "https://www.newrank.cn/public/info/detail.html?account=";

(async () => {

    const browser = await puppeteer.launch({headless: false});//set headless: true will hide chromium UI
    const page = await browser.newPage();
    await page.setUserAgent(userAgent);
    await page.setViewport({width:1920, height:1000});
    await page.setRequestInterception(true);

    //filter to block images
    page.on('request', request => {
    if (request.resourceType() === 'image')
      request.abort();
    else
      request.continue();
    });
    await page.goto(loginUrl);
    //login
    await loginOperate();
    //await page.close();

    await processMonthlyRank('.wx-right-type-list-spe a[icon=ss]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=mgs]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=cf]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=kj]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=cy]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=qc]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=ls]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=zc]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=jy]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=xs]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=zw]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=qy]');

    await processMonthlyRank('.wx-right-type-list-spe a[icon=wh]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=bk]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=jk]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=shs]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=ms]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=sj]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=lx]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=ym]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=qg]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=ty]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=mt]');
    await processMonthlyRank('.wx-right-type-list-spe a[icon=zs]');

    await processMonthlyRank('#wx_month_all');




    async function loginOperate(){
        try{
            await page.click('div[data-type=pwd]');
        }catch(err){
            console.log('login#1');
        }

        try{
            await page.type('#account_input',userName);
            await page.type('#password_input',ppwwdd);
        }catch(err){
            console.log('login#2');
        }

        try{
            await page.click('#pwd_confirm');
        }catch(err){
            console.log('login#3');
        }

    }

    async function processMonthlyRank(btn){
        const tab = await browser.newPage();
        await tab.setUserAgent(userAgent);
        await tab.setViewport({width:1920, height:1000});
        await tab.setRequestInterception(true);

        //filter to block images
        tab.on('request', request => {
        if (request.resourceType() === 'image')
          request.abort();
        else
          request.continue();
        });
        await tab.goto(monthlyRankUrl);
        try{
            await tab.click(btn);
        }catch(err){
            console.log('processMonthlyRank#1');
        }
        let fileName = await tab.evaluate(function(param){
            return document.querySelector(param).innerHTML;
        },btn);
        console.log('-------------------------'+fileName+'-------------------------');
        await scrollWait(tab);
        await waitSecond(tab);

        const sel = '.wx_main tr';
        const texts = await tab.evaluate((sel) => {
        let elements = Array.from(document.querySelectorAll(sel));
            let txt = elements.map(element => {
                return element.innerText
            })
            return txt;
        }, sel);
        console.log('total rows: '+texts.length);
        let contents='记录条数'+(texts.length-1)+'\n\n';
        texts.forEach(function(c,index){
            if(index>0){
                contents+=c+'\n\n';
            }
        });

        const fs = require("fs");
        fs.writeFileSync(workPath+'/'+fileName+'.txt',contents);
        console.log(fileName + " has been extracted to local.");

        const idSel = '.wx_main tr a[href^="detail.html"]';
        const ids = await tab.evaluate((idSel) => {
        let elements = Array.from(document.querySelectorAll(idSel));
            let txt = elements.map(element => {
                return element.innerText
            })
            return txt;
        }, idSel);
        let idContents='';
        let w_name;
        let flag =true;
        /*ids.forEach(async function(id,index){
            if(index%2!=0){
                idContents+=id+'\n';
                await getDetail(fileName,w_name,id);
                w_name =null;
            }else{
                w_name=id;
            }
        });*/
        await (async ()=>{
            for(let i=0;i<ids.length;i++){
                if(i%2!=0){
                idContents+=ids[i]+'\n';
                await getDetail(fileName,w_name,ids[i]);
                w_name =null;
            }else{
                w_name=ids[i];
            }
            }
        })();
        let idFile = 'id_'+fileName;
        fs.writeFileSync(workPath+'/'+idFile+'.txt',idContents);
        console.log(idFile + " has been extracted to local.");
        await tab.close();
    }

    async function scrollWait(p, n){
        if(n==null) n=5;
        for(let i= 0; i<n;i++){
            try{
                await p.evaluate(()=>window.scrollTo(0, document.body.scrollHeight));
                await p.waitForNavigation({timeout:500,waitUntil: ['networkidle0']});
            }catch(err){
                console.log('scroll to bottom and then wait 500 ms.');
            }
        }
    }

    async function waitSecond(p){
        try{
            await p.waitForNavigation({timeout:2000,waitUntil: ['networkidle0']});
        }catch(err){
            //console.log('wait 1 sec.');
        }
    }

    async function getDetail(cat,name,id){
        const tab = await browser.newPage();
        await tab.setUserAgent(userAgent);
        await tab.setViewport({width:1920, height:1000});
        await tab.setRequestInterception(true);

        //filter to block images
        tab.on('request', request => {
        if (request.resourceType() === 'image')
          request.abort();
        else
          request.continue();
        });
        await tab.goto(detailUrl+id);
        await waitSecond(tab);
        const sel = '#info_detail_article_top li .title a';
        const hrefs = await tab.evaluate((sel) => {
            let elements = Array.from(document.querySelectorAll(sel));
            let links = elements.map(element => {
                return element.href
            })
            return links;
        }, sel);
        let urlList='';
        hrefs.forEach(function(href,index){
            urlList+=href+"\n";
        });
        const fs = require("fs");
        if (!fs.existsSync(workPath+'/'+cat)) {
            fs.mkdirSync(workPath+'/'+cat)
        }
        fs.writeFileSync(workPath+'/'+cat+'/'+id+'_top_'+name+'.txt',urlList);

        const sel1 = '#info_detail_article_lastest li .title a';
        const hrefs1 = await tab.evaluate((sel1) => {
            let elements = Array.from(document.querySelectorAll(sel1));
            let links = elements.map(element => {
                return element.href
            })
            return links;
        }, sel1);
        let urlList1='';
        hrefs1.forEach(function(href,index){
            urlList1+=href+"\n";
        });
        fs.writeFileSync(workPath+'/'+cat+'/'+id+'_lastest_'+name+'.txt',urlList1);
        console.log(id+' '+name+' has been extracted to local.');
        await tab.close();
    }

})();

2（JAVA）

Jsoup抓取微信文章文本：Vps 安全设置 Win2003中IIS的安全设置技巧

package com;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadLocalRandom;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class WeChatUrls extends Thread {
    private File catFile;
    final static Integer ThreadNum = 1;
    final String ERROR = "ERROR";
    private final static String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36";
    private final static String WORK_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn_articles";
    private final static String READ_URLS_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn";

    public WeChatUrls(File cat) {
        this.catFile = cat;
    }

    private String getUrlProxyContent(String url) {
        String body = ERROR;
        try {
            Document doc = Jsoup.connect(url).userAgent(USER_AGENT).get();
            if (doc.select("body") != null) {
                body = doc.select("body").text();
            }
        } catch (IOException e) {
            System.out.println("ERROR URL: " + url);
            e.printStackTrace();
        }

        return body;
    }

    private void write(String content, String fileName) {
        File f = new File(fileName);
        FileWriter fw = null;
        BufferedWriter bw = null;
        try {
            if (!f.exists()) {
                f.getParentFile().mkdirs();
                f.createNewFile();
            }
//             fw = new FileWriter(f.getAbsoluteFile(), true); // true表示可以追加新内容
            fw = new FileWriter(f.getAbsoluteFile()); // 表示不追加
            bw = new BufferedWriter(fw);
            bw.write(content);
            bw.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        File baseFolder = new File(READ_URLS_FOLDER);
        File[] cataFiles = baseFolder.listFiles();
        ExecutorService service = Executors.newFixedThreadPool(ThreadNum);
        Arrays.asList(cataFiles).stream().forEach(catFile -> {
            if (catFile.isFile() && catFile.getName().startsWith("id")) {
                service.execute(new WeChatUrls(catFile));
            }
        });
        service.shutdown();
    }

    private void process() {
//        Set<String> redoSet = new HashSet<>();
        String catagory = catFile.getName().split("\\.")[0].split("_")[1];
        File urlFolder = new File(READ_URLS_FOLDER + "\\" + catagory);
        File[] urlFiles = urlFolder.listFiles();
        if (urlFiles != null) {
            Arrays.asList(urlFiles).stream().forEach(urlFile -> {
                try {
                    BufferedReader reader = new BufferedReader(new FileReader(catFile));
                    String wechatId = null;
                    int countLatest = 1;
                    int countTop = 1;
                    while ((wechatId = reader.readLine()) != null) {
                        if (urlFile.getName().startsWith(wechatId)) {
                            String wechatName = urlFile.getName().split("\\.")[0].split("_")[2];
//                            if (urlFile.length() == 0) {
//                                redoSet.add("\"" + catagory + "\",\"" + wechatName + "\",\"" + wechatId + "\"");
//                            }
                            BufferedReader r = new BufferedReader(new FileReader(urlFile));
                            String wechatUrl = null;
                            while ((wechatUrl = r.readLine()) != null) {
                                String writePath = WORK_FOLDER + "\\" + catagory + "\\"
                                        + (urlFile.getName().contains("top") ? "top" : "latest") + "\\" + wechatId
                                        + "_" + wechatName + "_"
                                        + (urlFile.getName().contains("top") ? countTop++ : countLatest++)+".txt";
                                String content = getUrlProxyContent(wechatUrl);
                                write(content, writePath);
                                System.out.println(writePath);
                                Thread.sleep(ThreadLocalRandom.current().nextInt(500, 3000));
                            }
                            r.close();
                        }
                    }
                    reader.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            });
        }
//        redoSet.stream().forEach(System.out::println);

    }

    @Override
    public void run() {
        process();
    }
}

3（PYTHON）

wordcloud生成词云：

# -*- coding: utf-8 -*-
import json
import random
import time
import os
from pyecharts import Bar,Geo,Line,Overlap
import jieba
from scipy.misc import imread
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from collections import Counter
os.chdir('T:/Developer/puppeteerTestCase/newrank_cn_articles')

stopWords = ['微信','二维码','二维','扫一','一扫','公众','赞赏','转账','关注','打开','阅读','图片','关闭','取消','程序']

def proc(folder, type):
    fileLines = []
    rootdir = './'+folder+'/'+type
    list = os.listdir(rootdir)
    for i in range(0,len(list)):
            path = os.path.join(rootdir,list[i])
            if os.path.isfile(path):
                    try:
                        fo = open(path, 'r+')
                        fileLines += fo.readlines()
                    except:
                        print('error while processing file: ' + path)

    _str =  ' '.join(fileLines)
    words_list = []
    word_generator = jieba.cut_for_search(_str)  
    for word in word_generator:
        words_list.append(word)
    words_list = [k for k in words_list if len(k)>1 and k not in stopWords]
    back_color = imread('back.jpg')
    wc = WordCloud(background_color='white',
                   max_words=2000,
                   mask=back_color,
                   max_font_size=300,
                   font_path="C:/Windows/Fonts/msyh.ttc",
                   random_state=42
                   )
    _count = Counter(words_list)
    wc.generate_from_frequencies(_count)
    image_colors = ImageColorGenerator(back_color)
    wc.recolor(color_func=image_colors)
    #plt.figure()
    #plt.imshow(wc.recolor(color_func=image_colors))
    #plt.axis('off')

    # The pil way (if you don't have matplotlib)
    image = wc.to_image()
    image.show()
    jpgFile = './'+type+'_'+folder+'.jpg'
    image.save(jpgFile)
    print('image File saved:' + jpgFile)





basedir = './'
baselist = os.listdir(basedir)
for l in range(0,len(baselist)):
        p = os.path.join(basedir,baselist[l])
        if os.path.isdir(p):
                proc(os.path.basename(p), 'top')