linzeX

第二次大作业

作业一

点击查看代码
 
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class WeatherDB:
    """SQLite-backed store for scraped weather forecasts (file 111.db)."""

    def openDB(self):
        """Connect to 111.db and make sure an empty `weathers` table exists."""
        self.con = sqlite3.connect("111.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except sqlite3.OperationalError:
            # Table already exists from a previous run: clear it instead.
            # (Was a bare `except:`, which would also swallow KeyboardInterrupt.)
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit pending rows and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row; failures (e.g. duplicate city+date
        primary key) are printed rather than raised — best-effort insert."""
        try:
            self.cursor.execute(
                "insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        """Print every stored row in fixed-width columns."""
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))
class WeatherForecast:
    """Scrape multi-day forecasts from weather.com.cn and store them via WeatherDB."""

    def __init__(self):
        # Browser-like User-Agent so the site serves the normal HTML page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        # City name -> weather.com.cn station code.
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        """Fetch the forecast page for `city` and insert one row per day into self.db.

        Unknown cities are reported and skipped; network/parse errors are
        printed rather than raised (best-effort scraping).
        """
        if city not in self.cityCode:  # idiomatic membership test (was `.keys()`)
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            # The site may serve utf-8 or gbk; let UnicodeDammit decide.
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    # Some <li> entries may lack a node (e.g. missing <span>
                    # temperature) — report and continue with the next day.
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        """Open the DB, scrape every city in `cities`, then commit and close."""
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        # self.db.show()
        self.db.closeDB()
# Entry point: scrape forecasts for the four supported cities.
cities = ["北京", "上海", "广州", "深圳"]
ws = WeatherForecast()
ws.process(cities)
print("031904118林泽熙")
  

心得体会

  • 该作业为复现作业,通过学习主要了解到爬取网页内容后如何写入数据库;其他内容都是已学习过的爬取过程,有正确代码可以直接参考,学习效率很高!

作业二

  • 用requests和自选提取信息方法定向爬取股票相关信息,并存储在数据库中。
    (在谷歌浏览器中进入F12调试模式进行抓包,查找股票列表加载使用的url,
    并分析api返回的值,并根据所要求的参数可适当更改api的请求参数。根据URL可观察请求的参数f1、
    f2可获取不同的数值,根据情况可删减请求的参数。)

    输出内容

    序号 股票代码 股票名称 涨跌幅 涨跌额 成交量 成交额 振幅 最高 最低 今开 昨收
    1 688093 N世华 28.47 10 344213 51242141 -0.33 321 123 28 20
    2 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

    结果展示

  • 码云链接
    ------作业2-----

  • 代码

点击查看代码
 
import re
import urllib.request
import sqlite3
class gpDB:
    """SQLite-backed store for scraped stock quotes (file ewe.db, table ewe)."""

    def openDB(self):
        """Open or create the database and ensure an empty `ewe` table."""
        self.con = sqlite3.connect("ewe.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table ewe (序号 varchar(10),股票代码 varchar(10),股票名称 varchar(10),最新报价 varchar(10),涨跌幅 varchar(10), 涨跌额 varchar(10),成交量 varchar(10),成交额 varchar(10),振幅 varchar(10),昨收 varchar(10),今收 varchar(10),最高 varchar(10),最低 varchar(10),constraint pk_weather primary key (股票代码))")
        except sqlite3.OperationalError:
            # Table already exists from a previous run: clear it instead.
            # (Was a bare `except:` catching everything.)
            self.cursor.execute("delete from ewe")

    def closeDB(self):
        """Commit pending rows and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, no, number, name, newest, yesterday, today, highest, lowest, f3, f4, f5, f6, f7):
        """Insert one quote row; duplicate stock codes are printed, not raised.

        NOTE the column order below: f3..f7 (涨跌幅/涨跌额/成交量/成交额/振幅)
        are stored BEFORE yesterday/today/highest/lowest (昨收/今收/最高/最低),
        even though the parameters arrive in the opposite order.
        """
        try:
            self.cursor.execute(
                "insert into ewe (序号,股票代码,股票名称,最新报价,涨跌幅,涨跌额 ,成交量 ,成交额 ,振幅,昨收,今收,最高,最低) values (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                (no, number, name, newest, f3, f4, f5, f6, f7, yesterday, today, highest, lowest))
        except Exception as err:
            print(err)

    def show(self):
        """Print every stored row.

        Header fixed to match all 13 stored columns: the original omitted
        振幅 (12 labels for 13 fields) and said 今开 where the table column
        is actually 今收, shifting every label after 成交额 by one.
        """
        self.cursor.execute("select * from ewe")
        rows = self.cursor.fetchall()
        print("序号       股票代码       股票名称        最新报价        涨跌幅       涨跌额        成交量        成交额        振幅        昨收       今收      最高      最低")
        for row in rows:
            print("%-16s%-16s%-32s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s" % tuple(row))
## Fetch a page's content.
def getHtml(url):
    """Fetch `url` and return the response body decoded as UTF-8.

    Uses a context manager so the HTTP response is always closed
    (the original left the connection open).
    """
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode('utf-8')
class poss:
    """Scrape one page of stock quotes from the Eastmoney list API into gpDB."""

    ## regex-parse the API payload and write each record to the database
    def url_request(self):
        """Download the quote list, extract fields with regexes, insert rows.

        Fixes vs. the original:
        - numeric pattern: `(\\d*.\\d*)` had an unescaped dot, so it missed
          negative values (captured "-1" from -1.23) and picked up trailing
          commas on integer fields; replaced with `(-?\\d+(?:\\.\\d+)?)`.
        - `range(len(f7)-1)` silently dropped the last stock; zip() covers
          every complete record and guards against misaligned list lengths.
        """
        html = getHtml("http://76.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112408139454415054219_1634087992156&pn=1&pz=50&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=&fs=b:MK0010&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f26,f22,f11,f62,f128,f136,f115,f152&_=1634087992177")
        num = r'(-?\d+(?:\.\d+)?)'          # signed decimal number
        number = re.findall(r'"f12":"(\d*)"', html)      # stock code
        name = re.findall(r'"f14":"(.*?)"', html)        # stock name
        newest = re.findall(r'"f2":' + num, html)        # latest price
        yesterday = re.findall(r'"f15":' + num, html)
        today = re.findall(r'"f16":' + num, html)
        highest = re.findall(r'"f17":' + num, html)
        lowest = re.findall(r'"f18":' + num, html)
        f3 = re.findall(r'"f3":' + num, html)            # change percent
        f4 = re.findall(r'"f4":' + num, html)            # change amount
        f5 = re.findall(r'"f5":' + num, html)            # volume
        f6 = re.findall(r'"f6":' + num, html)            # turnover
        f7 = re.findall(r'"f7":' + num, html)            # amplitude
        rows = zip(number, name, newest, yesterday, today, highest, lowest,
                   f3, f4, f5, f6, f7)
        for i, fields in enumerate(rows, start=1):
            self.db.insert(str(i), *fields)

    ## scraping driver
    def process(self):
        """Open the DB, scrape once, dump the table to the console, close."""
        self.db = gpDB()
        self.db.openDB()
        self.url_request()
        self.db.show()
        self.db.closeDB()
# Entry point: scrape the stock list once and print the stored table.
ws = poss()
ws.process()
print("031904118林泽熙")
  

心得体会

  • 难度1在于一个个抓包

    难度2在于复现创建数据库过程,不过有模板就很好抄了
    难度3的话股票名称由数字和中文字符组成,想要规范在pycharm里输出似乎有点麻烦。不过还好是导入数据库

作业三

点击查看代码
 

import re
import urllib.request
import sqlite3
## Fetch a page's content.
def getHtml(url):
    """Fetch `url` and return the response body decoded as UTF-8.

    Uses a context manager so the HTTP response is always closed
    (the original left the connection open).
    """
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode('utf-8')
def url_request():
    """Scrape the 2021 university ranking payload and store it in xuexiao.db.

    Prints a formatted rank / name / score table and inserts every row into
    the `xuexiao` table.
    """
    con = sqlite3.connect("xuexiao.db")
    cursor = con.cursor()
    try:
        cursor.execute(
            "create table xuexiao (no varchar(10),name varchar(10), grade varchar(10) ,constraint pk_weather primary key (name))")
    except sqlite3.OperationalError:
        # Re-running used to crash on "table already exists"; mirror the
        # WeatherDB/gpDB pattern and just clear the previous rows.
        cursor.execute("delete from xuexiao")
    html = getHtml("https://www.shanghairanking.cn/_nuxt/static/1632381606/rankings/bcur/2021/payload.js")
    # regex-match school names and scores out of the JS payload
    schoolclass = re.findall(r'univNameCn:"(.*?)"', html)
    grade = re.findall(r'score:(.*?),', html)
    # header row
    print('{0:^10}{1:^32}{2:^12}'.format('排名', '学校名称', '分数'))
    # zip() guards against the two lists differing in length (the original
    # indexed grade[i] by schoolclass's length and could IndexError).
    for i, (school, score) in enumerate(zip(schoolclass, grade), start=1):
        cursor.execute(
            "insert into xuexiao (no,name,grade ) values (?,?,?)",
            (str(i), school, str(score)))
        print('{0:^12}'.format(i), end='')
        # pad the name with full-width spaces (chr(12288)) so CJK aligns
        print('{0:{1}^22}'.format(school, chr(12288)), end='')
        print('{0:^10}'.format(score))
    con.commit()
    con.close()
# Entry point: run the ranking scrape once.
url_request()
print("031904118 林泽熙")
  
  • 抓包演示:

心得体会

  • 改了下前面的写数据库代码,发现量少很多
    GIF真难弄,就给十秒时间

posted on 2021-10-19 18:57  linzeX  阅读(12)  评论(0编辑  收藏  举报

导航