Data Collection: Assignment 6

Assignment ①

(1) Requirements:

Use the requests and BeautifulSoup libraries to crawl the Douban Movie Top 250 data.
Download each movie's poster image with multiple threads, naming each image file after the movie.
Get familiar with how regular expressions are used (a small sketch follows this list).
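
A minimal regex sketch of the two patterns used in the crawler below; the sample strings are made-up stand-ins for what the page text looks like:

import re

line = "1994\xa0/\xa0美国\xa0/\xa0犯罪 剧情"
year = re.findall(r'\d{4}', line)[0]              # '1994' -- the first run of four digits
count = re.search(r'\d+人评价', '2105340人评价')     # digits followed by 人评价
print(year, count.group())                        # 1994 2105340人评价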

Crawler code

import os
import re
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import threading


def spider(start_url):
    global threads
    global urls
    global count
    try:
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, 'html.parser')
        olTag = soup.find('ol', class_='grid_view')
        details = olTag.find_all('li')
        for detail in details:
            movieRank = detail.find('em').text  # rank
            movieName = '《' + detail.find('span', class_='title').text + '》'  # title
            movieScore = detail.find('span', class_='rating_num').text + '分'  # rating
            movieCommentNum = detail.find(string=re.compile(r'\d+人评价'))  # number of people who rated
            inqTag = detail.find('span', class_='inq')  # some movies have no short quote
            movieReview = '"' + inqTag.text + '"' if inqTag else ''  # short quote
            movieP = detail.find('p').text
            movieP1 = movieP.split('\n')[1]
            movieP2 = movieP.split('\n')[2]
            movieDirector = movieP1.split('\xa0')[0].strip()[4:]  # director (strip the leading "导演: ")
            movieYear = re.findall(r'\d{4}', movieP2)[0]  # release year
            movieCountry = movieP2.split('\xa0/\xa0')[-2]  # country/region
            movieType = movieP2.split('\xa0/\xa0')[-1].strip()  # genres
            print(movieRank, movieName, movieDirector, movieYear, movieCountry, movieType, movieScore,
                  movieCommentNum, movieReview)

        # download the poster images on this page
        images = soup.select("img")
        for image in images:
            try:
                url = image["src"]  # image download link
                name = image["alt"]  # image name (the movie title)
                print(url)
                if url not in urls:
                    urls.append(url)
                    T = threading.Thread(target=download, args=(url, name))
                    T.daemon = False  # non-daemon thread so join() below waits for it
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)

    except Exception as err:
        print(err)

def download(url, name):
        try:
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            path = "images_\\" + name + ".png"  # save under the images_ folder as <movie name>.png
            fobj = open(path, "wb")
            fobj.write(data)
            fobj.close()
            print("download" + name + ".png")
        except Exception as err:
            print(err)

headers = {
    "User-Agent": "Mozilla/5.0(Windows;U;Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2008072421 Minefield/3.0.2pre"
}

urls = []  # image urls already scheduled, to avoid duplicate downloads
threads = []  # download threads, joined after all pages are crawled
os.makedirs("images_", exist_ok=True)  # make sure the output folder exists
# 10 pages in total, 25 movies per page
for i in range(0, 10):
    start_url = "https://movie.douban.com/top250?start=" + str(i * 25) + "&filter="
    print("start Spider")
    spider(start_url)

for t in threads:
    t.join()

Output


(2) Reflections

This task was a review of multithreading and BeautifulSoup. I started with an overly complicated approach that kept throwing errors; after going back to the textbook examples the crawl finally worked. Another lesson is that element locating sometimes picks the wrong node (or none at all), which needs extra care.
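
On the locating problem: BeautifulSoup's find() quietly returns None when nothing matches (for example, a movie card without a short quote), and an unguarded .text access then raises AttributeError and skips the rest of that page. A minimal sketch of the guard, using a made-up one-card HTML snippet:

from bs4 import BeautifulSoup

html = '<li><span class="title">肖申克的救赎</span></li>'   # a card without a <span class="inq"> quote
detail = BeautifulSoup(html, 'html.parser')

inqTag = detail.find('span', class_='inq')                # find() returns None when nothing matches
movieReview = '"' + inqTag.text + '"' if inqTag else ''   # guard before touching .text
print(repr(movieReview))                                  # prints '' instead of raising AttributeError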

Assignment ②

(1) Requirements:

Become proficient with serializing and outputting data through Scrapy's Item and Pipeline classes; use the Scrapy + XPath + MySQL route to crawl the 软科 (Shanghai Ranking) university ranking.
Crawl the ranking list, follow each university's detail link, download and store the university logo, and collect the official website URL, introduction and other details.

Crawler code

import urllib.request
import scrapy
from ..items import RankItem
from bs4 import UnicodeDammit
class MySpider(scrapy.Spider):
    name = "mySpider"
    start_urls=['https://www.shanghairanking.cn/rankings/bcur/2020']

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            tds = selector.xpath("//div[@class='rk-table-box']//tr")
            for td in tds[1:]:
                sNo = td.xpath("./td[position()=1]/text()").extract_first()
                schoolName =td.xpath("./td[position()=2]/a/text()").extract_first()
                city = td.xpath("./td[position()=3]/text()").extract_first()
                Url =td.xpath("./td[position()=2]/a/@href").extract_first()
                url=response.urljoin(Url)
                # print(url)
                req = urllib.request.Request(url)
                html = urllib.request.urlopen(req)
                html = html.read()
                dammit = UnicodeDammit(html, ["utf-8", "gbk"])
                html = dammit.unicode_markup
                selector1 = scrapy.Selector(text=html)
                officialUrl = selector1.xpath("//div[@class='univ-website']/a/text()").extract_first()
                mFile = selector1.xpath("//td[@class='univ-logo']/img/@src").extract_first()
                try:
                    info = selector1.xpath("//div[@class='univ-introduce']/p/text()").extract_first()
                except:
                    info = ""

                item = RankItem()
                item["sNo"] = sNo.strip() if sNo else ""
                sNo1=sNo.strip() if sNo else ""
                print(sNo1)
                item["schoolName"] =schoolName.strip() if schoolName else ""
                item["city"] = city.strip() if city else ""
                item["officialUrl"] = officialUrl.strip() if officialUrl else ""
                item["info"] = info.strip() if info else ""
                item["mFile"] = str(sNo1)+".jpg"
                yield item
        except Exception as err:
            print(err)
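
The spider imports RankItem from items.py, which is not shown in the post. A minimal sketch, assuming the field names used by the spider and pipeline above (including url, which process_item reads when downloading the logo):

# items.py -- minimal sketch; field names follow the spider/pipeline above
import scrapy

class RankItem(scrapy.Item):
    sNo = scrapy.Field()          # rank / serial number
    schoolName = scrapy.Field()   # university name
    city = scrapy.Field()         # city
    officialUrl = scrapy.Field()  # official website url
    info = scrapy.Field()         # introduction text
    mFile = scrapy.Field()        # local logo file name, e.g. "1.jpg"
    url = scrapy.Field()          # logo download url used by the pipeline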

pipelines.py

import urllib.request
import pymysql
class RankPipeline:
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="root", db="mydb", charset="gbk")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from ranks")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")

    def download(self,url, count):
        UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari" \
                    "/537.36 Edge/18.18363"
        headers = {'User-Agent': UserAgent}
        try:
            # keep the url's file extension (e.g. ".jpg") if it has one
            ext = url[-4:] if url[-4] == "." else ""
            req = urllib.request.Request(url,headers=headers)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            fobj = open("E:\\wangluopachong_6\\images_2\\" + str(count) + ext, "wb")
            fobj.write(data)
            fobj.close()
            print("downloaded " + str(count) + ext)
        except Exception as err:
            print(err)

    def process_item(self, item, spider):
        try:
            # print(item["sNo"])
            # print(item["schoolName"])
            # print(item["city"])
            # print(item["officialUrl"])
            # print(item["info"])
            # print()
            if self.opened:
                self.count += 1
                self.cursor.execute(
                    "insert into ranks(sNo,schoolName,city,officialUrl,info,mFile)values(%s,%s,%s,%s,%s,%s)",
                    (item["sNo"], item["schoolName"], item["city"],
                     item["officialUrl"], item["info"], item["mFile"]))
            if item["url"]:  # only download when the spider found a logo url
                self.download(item["url"], item["sNo"])
        except Exception as err:
            print(err)

        return item
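
The pipeline only runs if it is registered in settings.py. A minimal sketch, assuming the Scrapy project module is called rank (substitute your own project name); it also assumes the ranks table already exists in mydb, since open_spider only clears it with delete from ranks:

# settings.py -- minimal sketch; "rank" is an assumed project/module name
BOT_NAME = "rank"
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    "rank.pipelines.RankPipeline": 300,   # enable the MySQL + logo-download pipeline above
}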

Output



(2) Reflections

While writing the code I realized I had forgotten some of the Scrapy details and made quite a few silly mistakes; more practice is clearly needed.

Assignment ③

(1) Requirements:

Become proficient with Selenium: locating HTML elements, simulating user login, crawling Ajax-loaded pages, and waiting for elements.
Use the Selenium framework + MySQL to crawl course information from the China MOOC site (icourse163): course id, course name, school, lead teacher, team members, enrollment count, course schedule, and course description.

Crawler code

import pymysql
from selenium import webdriver
import datetime
import time
from selenium.webdriver.support import ui

class MySpider:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
    def startUp(self, url):
        # Initializing Chrome browser
        self.driver = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        # Initializing variables
        self.threads = []
        self.count = 0
        self.page = 1
        # Initializing database
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="root", db="mydb",charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            try:
                # drop the table if it already exists
                self.cursor.execute("drop table courses2")
            except Exception as err:
                print(err)
            try:
                # create a fresh table
                sql = "create table courses2(Id int,cCourse VARCHAR (32),cCollege VARCHAR(32),cTeacher VARCHAR(32),cTeam VARCHAR(32),cCount VARCHAR(32),cProcess VARCHAR(32),cBrief VARCHAR(512))"
                self.cursor.execute(sql)
            except Exception as err:
                print(err)
        except Exception as err:
            print(err)
        # open the login page and walk through the login dialog
        try:
            wait = ui.WebDriverWait(self.driver, 10)
            self.driver.get(url)
            time.sleep(2)  # give the page 2 seconds to render
            self.driver.find_element_by_xpath("//div[@class='_1Y4Ni']//div[@role='button']").click()
            time.sleep(0.5)  # short pause before switching to the account/password login
            self.driver.find_element_by_xpath("//span[@class='ux-login-set-scan-code_ft_back']").click()
            time.sleep(0.5)
            self.driver.find_elements_by_xpath("//ul[@class='ux-tabs-underline_hd']//li")[1].click()
            time.sleep(2)  # the account form lives inside an iframe, so switch into it next
            frame_ = self.driver.find_element_by_xpath("//div[@class='ux-login-set-container']//iframe")
            self.driver.switch_to.frame(frame_)

            wait.until(lambda driver: driver.find_element_by_xpath("//div[@class='u-input box']//input"))
            self.driver.find_element_by_xpath("//div[@class='u-input box']//input").send_keys('15259181235')
            wait.until(lambda driver: driver.find_element_by_xpath("//input[@placeholder='请输入密码']"))
            self.driver.find_element_by_xpath("//input[@placeholder='请输入密码']").send_keys("123456789blh")  # 点击登录按钮实现登录
            self.driver.find_element_by_id('submitBtn').click()  # 登录成功后跳转首页,进行加载,休眠10秒加载页面
            time.sleep(10)
            self.driver.find_element_by_xpath("//div[@class='u-navLogin-myCourse-t']//span[@class='nav']").click()
            time.sleep(10)

        except Exception as err:
            print(err)

    def closeUp(self):
        try:

            self.con.commit()
            self.con.close()
            self.driver.close()

        except Exception as err:
            print(err)

    def insertDB(self, id, ccourse, ccollege, cteacher, cTeam, ccount, cprocess, cbrief):
        try:
            sql = "insert into courses2(id,ccourse,ccollege,cteacher,cTeam,ccount,cprocess,cbrief)values(%s,%s,%s,%s,%s,%s,%s,%s)"
            self.cursor.execute(sql,(id,ccourse,ccollege,cteacher,cTeam,ccount,cprocess,cbrief))
        except Exception as err:
            print(err)


    def processSpider(self):
        try:
            time.sleep(1)
            print(self.driver.current_url)
            divs = self.driver.find_elements_by_xpath("//div[@class='course-card-wrapper']")
            for div in divs:
                try:

                    ccourse = div.find_element_by_xpath(".//span[@class='text']").text
                    ccollege = div.find_element_by_xpath(".//div[@class='school']/a").text
                except Exception as err:
                    print(err)
                    print("爬取失败1")
                try:
                    self.driver.execute_script("arguments[0].click();",div.find_element_by_xpath(".//div[@class='menu']//a[@class='ga-click']"))
                    # switch to the newly opened course detail tab
                    self.driver.switch_to.window(self.driver.window_handles[-1])
                    time.sleep(2)
                    ccount=self.driver.find_element_by_xpath("//span[@class='course-enroll-info_course-enroll_price-enroll_enroll-count']").text
                    cprocess = self.driver.find_element_by_xpath("//div[@class='course-enroll-info_course-info_term-info_term-time']/span[2]").text
                    cbrief = self.driver.find_element_by_xpath("//div[@class='course-heading-intro_intro']").text
                    cteacher = self.driver.find_element_by_xpath("//h3[@class='f-fc3']").text
                    cteam = self.driver.find_elements_by_xpath("//h3[@class='f-fc3']")
                    cTeam = ""
                    for t in cteam:
                        cTeam += " " + t.text
                    # close the detail tab and switch back to the course list
                    self.driver.close()
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.count += 1
                    id = self.count
                    print(ccourse, ccollege,ccount,cprocess,cbrief,cTeam,cteacher)
                    try:
                        self.insertDB(id, ccourse, ccollege, cteacher, cTeam, ccount, cprocess, cbrief)
                    except Exception as err:
                        print(err)
                        print("插入失败")
                except Exception as err:
                    print(err)
                    print("爬取失败2")
        except Exception as err:
            print(err)

    def executeSpider(self, url):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        for t in self.threads:
            t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")

url = "https://www.icourse163.org/"
spider = MySpider()
while True:
    print("1.爬取")
    print("2.退出")
    s = input("请选择(1,2):")
    if s == "1":
        spider.executeSpider(url)
    elif s == "2":
        break

Output



(2) Reflections

① When locating page elements with Selenium you sometimes cannot find an element even though it is clearly right there on the page. Here the login form lives inside an iframe, and Selenium's switch_to.frame() is what switches the driver into it.

② NoSuchElementException: Message: Unable to locate element. The element probably has not finished loading yet; adding an explicit wait (two lines) fixes it:

wait = ui.WebDriverWait(driver,10)

wait.until(lambda driver: driver.find_element_by_xpath("<your locator here>"))
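
Putting both points together, a condensed self-contained sketch of the two fixes (in the real crawler you still have to click through the login dialog first, as in startUp() above); the iframe and input XPaths are the ones used in startUp(), and the account value is a placeholder:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.icourse163.org/")
wait = WebDriverWait(driver, 10)

# point ①: the login form sits inside an iframe, so switch into it first
frame = wait.until(EC.presence_of_element_located(
    (By.XPATH, "//div[@class='ux-login-set-container']//iframe")))
driver.switch_to.frame(frame)

# point ②: wait until the element actually exists before touching it
phone_input = wait.until(EC.presence_of_element_located(
    (By.XPATH, "//div[@class='u-input box']//input")))
phone_input.send_keys("<your phone number>")

driver.switch_to.default_content()  # switch back out of the iframe when done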
