软工第四周:爬虫知识学习

这周学习了爬虫的相关知识,具体要求:从网页爬取全国疫情分布情况,读入数据库,并结合图形化工具进行展示。

实现思路:利用jsoup得到网页数据,存入数据库。

package pachong;

import java.io.IOException;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;

import javax.naming.InitialContext;

import org.apache.commons.dbutils.QueryRunner;
import org.jsoup.Jsoup;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.mchange.v2.c3p0.DataSources;

import utils.DataSourceUtils;

public class get_DBnumber {

    /** Browser-like User-Agent so the endpoint's basic anti-crawler check passes. */
    public static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:49.0) Gecko/20100101 Firefox/49.0";

    /** Host header expected by the API endpoint. */
    public static final String HOST = "i.snssdk.com";

    /** Referer header so the request looks like it came from the hot-list page. */
    public static final String REFERER =
            "https://i.snssdk.com/feoffline/hot_list/template/hot_list/forum_tab.html?activeWidget=1";

    /**
     * Fetches the epidemic JSON feed, then stores one row per city and one
     * aggregated row per province into table {@code todaydata_copy1}.
     *
     * @throws IOException  if the HTTP request fails
     * @throws SQLException if a database insert fails
     */
    public static void main(String[] args) throws IOException, SQLException {
        // Root URL of the JSON feed.
        String url = "https://i.snssdk.com/forum/home/v1/info/?activeWidget=1&forum_id=1656784762444839";

        // ignoreContentType(true) is required: the response is JSON, and jsoup
        // otherwise rejects non-HTML content types with UnsupportedMimeTypeException.
        String resultBody = Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .header("Host", HOST)
                .header("Referer", REFERER)
                .ignoreContentType(true)
                .execute()
                .body();
        JSONObject jsonObject = JSON.parseObject(resultBody);

        // The epidemic numbers are embedded as a JSON *string* inside forum.extra.
        String ncovStringList = jsonObject.getJSONObject("forum")
                .getJSONObject("extra")
                .getString("ncov_string_list");

        JSONObject ncovListObj = JSON.parseObject(ncovStringList);
        JSONArray provinces = ncovListObj.getJSONArray("provinces");

        QueryRunner queryRunner = new QueryRunner(DataSourceUtils.getDataSource());
        // "insert ignore" skips duplicate-key rows, so re-running the crawler is safe.
        String sql = "insert ignore into todaydata_copy1 values(?,?,?,?,?,?,?,?)";

        for (int i = 0; i < provinces.size(); i++) {
            JSONObject province = provinces.getJSONObject(i);
            String updateDate = province.getString("updateDate");
            JSONArray cities = province.getJSONArray("cities");

            // Per-province totals, accumulated from the city rows.
            int confirmedSum = 0;
            int deathsSum = 0;
            int treatingSum = 0;
            // Province id = first two digits of a city id; stays null when the
            // province has no city entries.
            String provinceId = null;

            for (int j = 0; j < cities.size(); j++) {
                JSONObject city = cities.getJSONObject(j);
                String confirmedNum = city.getString("confirmedNum");
                String deathsNum = city.getString("deathsNum");
                String cityName = city.getString("name");
                String cityId = city.getString("id");
                String treatingNum = city.getString("treatingNum");

                provinceId = cityId.substring(0, 2);
                confirmedSum += parseOrZero(confirmedNum);
                deathsSum += parseOrZero(deathsNum);
                treatingSum += parseOrZero(treatingNum);

                queryRunner.update(sql, updateDate, provinceId, cityName,
                        confirmedNum, deathsNum, treatingNum, cityId, null);
            }

            // Aggregate row for the province. Skipped when no city supplied an id;
            // the previous version reused the prior province's id in that case.
            if (provinceId != null) {
                queryRunner.update(sql, updateDate, provinceId, null,
                        confirmedSum, deathsSum, treatingSum, null, null);
            }
        }
    }

    /** Parses a decimal string, treating null/blank/malformed values as 0. */
    private static int parseOrZero(String s) {
        if (s == null) {
            return 0;
        }
        try {
            return Integer.parseInt(s.trim());
        } catch (NumberFormatException e) {
            return 0;
        }
    }
}

利用python得到数据的方法:

import requests
import time, json
import sys;
import pymysql

def get_wangyi_request():
    """Fetch the NetEase (网易) national epidemic data feed.

    Sends browser-like headers so the API does not reject the crawler.
    Returns the raw ``requests.Response``; callers read ``.text`` as JSON.
    """
    url = 'https://c.m.163.com/ug/api/wuhan/app/data/list-total'

    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip,deflate,br',
        'accept-language': 'en-US,en;q=0.9,zh-CN;q = 0.8,zh;q = 0.7',
        'origin': 'https://wp.m.163.com',
        'referer': 'https://wp.m.163.com/',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        # Fixed typo: was 'same-ite', which is not a valid Sec-Fetch-Site value.
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0(WindowsNT10.0;Win64;x64) AppleWebKit/37.36 (KHTML, likeGecko) Chrome/82.0.4056.0 Safari/537.36 Edg/82.0.432.3'
    }

    # Timeout prevents the crawler from hanging forever on a stalled connection.
    result = requests.get(url, headers=headers, timeout=30)
    return result


def print_mess1(string: str, dict1total: dict) -> None:
    """Write confirmed/suspect/heal/dead counts to stdout on one line.

    Fixes the original 3-space indent on the last write (an IndentationError).

    :param string: label prefix, e.g. a region name.
    :param dict1total: dict with keys 'confirm', 'suspect', 'heal', 'dead';
        ``None`` values are printed as 0.
    """
    for label, key in (('确诊: ', 'confirm'), ('疑似: ', 'suspect'),
                       ('治愈: ', 'heal'), ('死亡: ', 'dead')):
        value = dict1total[key]
        sys.stdout.write(string + label + str(value if value is not None else 0))
        sys.stdout.write(' ')
   
if __name__ == '__main__':
    result = get_wangyi_request()

    # Top-level payload keys: chinaTotal, chinaDayList, lastUpdateTime, areaTree.
    json_str = json.loads(result.text)['data']

    print(json_str['lastUpdateTime'])
    # areaTree[0] is China; its children are the provinces. Each province dict
    # has keys: today, total, extData, name, id, lastUpdateTime, children.
    province_list = json_str['areaTree'][0]['children']

    conn = pymysql.connect(
        host='localhost',
        port=3306,  # int, not a string
        user='root',
        password='20000604',
        db='database',
        charset='utf8'
    )
    cursor = conn.cursor()

    try:
        # NOTE: column names ("pronvice", "lateUpdata") match the existing
        # table schema, typos included.
        sql = ('insert into pachong (pronvice,total_confirm,total_suspect,'
               'total_heal,total_dead,today_confirm,today_suspect,today_heal,'
               'today_dead,lateUpdata,id) '
               'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);')

        # enumerate from 1 so the first row gets id=1 (the old code
        # incremented before inserting, starting at 2).
        for row_id, province in enumerate(province_list, start=1):
            total = province['total']
            today = province['today']
            sys.stdout.write(province['name'] + '  ')
            # Parameter order now matches the column list above; the old code
            # passed lastUpdateTime 2nd, shifting every count one column over.
            cursor.execute(sql, [
                province['name'],
                total['confirm'], total['suspect'], total['heal'], total['dead'],
                today['confirm'], today['suspect'], today['heal'], today['dead'],
                province['lastUpdateTime'],
                row_id,
            ])
        print()

        conn.commit()
    finally:
        # Release DB resources even if an insert fails mid-loop.
        cursor.close()
        conn.close()

  学习总结:对网页理解的不足,导致很难找到真正需要的网页链接。对jsoup的掌握不深刻。

posted @ 2020-03-13 10:44  Protect_Winter  阅读(422)  评论(0)    收藏  举报