笔记,爬虫(5)

近期学习,虽不知其所以然,还是笨头笨脑地捣鼓出了一点东西,稍微有了点感觉。

 

  •  参考:http://blog.csdn.net/abclixu123/article/details/39754619
import urllib.request
import datetime
import re
from bs4 import BeautifulSoup

# Demonstrate that str.strip()/rstrip() treat their argument as a SET of
# characters to remove from the ends, not as a literal prefix or suffix.
a = '123abcdddd<dir>'
print(a)
# Leading '1' and '2' are in the set and are removed; stripping stops at '3'
# because '3' is not in '241'. The trailing '>' is not in the set either.
print(a.strip('241'))      # -> 3abcdddd<dir>
# Every trailing character drawn from {'<', 'd', 'i', 'r', '>'} is removed,
# so the run of d's before "<dir>" disappears as well.
print(a.rstrip('<dir>'))   # -> 123abc

# Notation: s is a string, rm is the set of characters to delete.
#   s.strip(rm)   removes characters found in rm from both ends of s
#   s.lstrip(rm)  removes characters found in rm from the start of s
#   s.rstrip(rm)  removes characters found in rm from the end of s
# Note: when rm is omitted, whitespace is removed by default
# (including '\n', '\r', '\t' and ' ').

# --- Stage 1: download and parse the catalogue page, timing the round trip ---
starttime = datetime.datetime.now()

url = "https://www.packtpub.com/all"
# The with-block closes the HTTP response as soon as parsing is finished,
# even if BeautifulSoup raises.
with urllib.request.urlopen(url) as page:
    soup_packtpage = BeautifulSoup(page, "html.parser")

endtime = datetime.datetime.now()
print(endtime - starttime)  # time taken to open and parse the page

# --- Stage 2: walk the parsed titles and print each title with its price ---
starttime = datetime.datetime.now()

all_book_title = soup_packtpage.find_all("div", class_="book-block-title")

# Spot-check a single title, but only when the page listed that many books.
if len(all_book_title) > 5:
    print(all_book_title[5].text)

# Passed positionally to find_next(), a compiled regex filters on the tag
# NAME. ".*?$" is a non-greedy pattern that matches any name, so the lookup
# below simply returns the next tag after the title; this relies on the page
# placing the price element directly after the title element.
# (An earlier pattern for price text such as " £ 12.34" was immediately
# overwritten and has been dropped; '£' is the pound sterling sign, not '$'.)
price_regexp = re.compile(".*?$")
for book_title in all_book_title:
    # book_title.text is equivalent to book_title.find_next(text=True) here.
    print("Book's name is " + book_title.text.strip())
    # Use .text rather than .string below: the author reports that .string
    # raised AttributeError: 'NoneType' object has no attribute 'strip'.
    book_price = book_title.find_next(price_regexp)
    if book_price is None:
        # No element follows the title (e.g. it is the last tag on the page).
        print("Book's price is ", "(not found)")
    else:
        # strip() also trims the surrounding blank lines from the output.
        print("Book's price is ", book_price.text.strip())
    print("\n")

endtime = datetime.datetime.now()
print(endtime - starttime)

 

这里给出参考源代码(如下),我一直没跑通,究其原因,部分可能是HTML的代码与以前相比发生变化,导致提供的正则不再适用:

import urllib.request  
import datetime  
import re  
  
from bs4 import BeautifulSoup  
  
# Reference script quoted verbatim from the cited article. The surrounding
# notes report that it never ran successfully, possibly because the page's
# HTML changed after the article was written.
starttime = datetime.datetime.now()  
  
url = "https://www.packtpub.com/all"  
page = urllib.request.urlopen(url)  
# NOTE(review): no parser name is passed to BeautifulSoup here - confirm
# which parser ends up being used.
soup_packtpage = BeautifulSoup(page)  
page.close()  
  
endtime = datetime.datetime.now()  
print (endtime - starttime)  
  
starttime = datetime.datetime.now()  
  
all_book_title = soup_packtpage.find_all("div", class_="book-block-title")  
  
  
# Pattern: whitespace, '£', one whitespace character, digits, '.', digits.
# NOTE(review): non-raw string containing \s, \d and \. escapes.
price_regexp = re.compile(u"\s+£\s\d+\.\d+")  
  
for book_title in all_book_title:  
    # NOTE(review): the notes report AttributeError: 'NoneType' object has no
    # attribute 'strip' when .string is used here.
    print("Book's name is " + book_title.string.strip())  
    # Looks for following text matching the price pattern; presumably this
    # returns None when the page has no such text - TODO confirm.
    book_price = book_title.find_next(text=price_regexp)  
    print("Book's price is "+ book_price.strip())  
    print("\n")  
      
# The second elapsed time is captured but never printed in this snippet.
endtime = datetime.datetime.now() 

结果(只保留部分截图):

 

 对应爬的网页源代码:

 

  • 参考:http://blog.csdn.net/ben_ben_niao/article/details/40677869
import re
import urllib.request
import urllib
import os   # os.path模块详见:http://www.cnblogs.com/dkblog/archive/2011/03/25/1995537.html

def getHtml(url, encoding='UTF-8'):
    """Fetch *url* and return the response body decoded as text.

    Args:
        url: Address passed to ``urllib.request.urlopen``.
        encoding: Codec used to decode the raw bytes. Defaults to UTF-8,
            which is what this function always used before the parameter
            existed.

    Returns:
        The decoded response body as ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    # The with-block guarantees the response is closed once it has been read.
    with urllib.request.urlopen(url) as page:
        html = page.read()
    return html.decode(encoding)

def getImg(html, path='f:\\test'):
    """Download every ``.jpg`` referenced as ``src="..." pic_ext`` in *html*.

    Images are saved into *path* as ``0.jpg``, ``1.jpg``, ... in the order
    their URLs appear in the markup.

    Args:
        html: Page source to scan, as ``str``.
        path: Directory that receives the images; created if missing.
            Defaults to the location this function originally hard-coded.
    """
    # The parentheses form a capture group, so findall() returns only the URL
    # part of each match. For Taobao PNGs the author used
    # r'data-lazy="(.+?\.png)" ' instead (check the page source for the
    # attribute that holds the image address).
    reg = r'src="(.+?\.jpg)" pic_ext'
    imglist = re.findall(reg, html)

    # Create the target directory when it does not exist yet.
    os.makedirs(path, exist_ok=True)

    for x, imgurl in enumerate(imglist):
        # os.path.join uses the platform's separator; appending '\\' by hand
        # only yields a path inside the directory on Windows.
        target = os.path.join(path, '{}.jpg'.format(x))
        # urlretrieve() downloads the remote data straight to a local file.
        urllib.request.urlretrieve(imgurl, target)

# Fetch one Baidu Tieba thread and save every .jpg it references.
# (For Taobao the author used getHtml(r"http://www.taobao.com/") instead.)
tieba_url = "http://tieba.baidu.com/p/2460150866"
html = getHtml(tieba_url)
getImg(html)

结果(右边是对应的图片源代码):

 

  • 参考:http://blog.csdn.net/omuyejingfeng1/article/details/24261203/
import urllib
import urllib.request as request
from bs4 import BeautifulSoup

def taobao(url, path='f:/image/'):
    """Download every lazily-loaded image referenced by the page at *url*.

    Each ``<img>`` tag carrying a ``data-src`` attribute is downloaded into
    *path* and named ``1<suffix>``, ``2<suffix>``, ... in document order,
    where the suffix (``.png``, ``.jpg``, ...) is taken from the image URL.

    Args:
        url: Page to scan; also the base for resolving incomplete image URLs.
        path: Directory that receives the images; created if missing.
            Defaults to the location this function originally hard-coded.
    """
    import os
    from urllib.parse import urljoin, urlsplit

    # Close the response as soon as the body has been read.
    with request.urlopen(url) as response:
        data = response.read().decode('utf-8')
    soup = BeautifulSoup(data, "html.parser")

    os.makedirs(path, exist_ok=True)

    # Keep only the images whose tag has a data-src attribute.
    lazy_sources = [
        tag.attrs['data-src']
        for tag in soup.find_all('img')
        if 'data-src' in tag.attrs
    ]

    for count, image in enumerate(lazy_sources, start=1):
        # Take the extension from the URL's path component so that a query
        # string or a dot in the host name cannot leak into the file name.
        suffix = os.path.splitext(urlsplit(image).path)[1]
        # Some URLs are complete while others lack the scheme (e.g.
        # "//host/x.png"); urlretrieve needs the full address, so resolve
        # the incomplete ones against the page URL.
        if not image.startswith(('http://', 'https://')):
            image = urljoin(url, image)
            print(image)
        filepath = os.path.join(path, str(count) + suffix)
        request.urlretrieve(image, filepath)

if __name__ == '__main__':
    # Run the downloader against the Taobao home page when executed directly.
    url = ('http://www.taobao.com/'
           '?spm=a310q.2219005.1581860521.1.b9kUd4')
    taobao(url)

 

原来的程序(如下)没能跑通,而且其中有些错误(可能是原作者无意间留下的),与上面修改后的版本对比即可发现:

import urllib  
import urllib.request as request  
from bs4 import BeautifulSoup  
def taobao(url):  
    """Original reference version, quoted verbatim for comparison.

    Fetches the page at *url* and, for every ``<img>`` tag that has a
    ``data-lazy`` attribute, downloads that attribute's URL into
    ``f:/image/`` as ``<count><suffix>``. The surrounding notes report that
    this version did not run and contains some mistakes.
    """
    response = request.urlopen(url)  
    html = response.read()  
    # Original author's note: on Windows 7 the default is gbk, so decode
    # first and then re-encode as utf-8 so that Chinese characters display.
    # NOTE(review): after encode() `data` is bytes again, not str.
    data = html.decode('gbk').encode('utf-8')  
    # NOTE(review): no parser name is passed to BeautifulSoup - confirm
    # which parser ends up being used.
    soup = BeautifulSoup(data)  
    path = 'f:/image/'  
    count = 1  
    # NOTE(review): `list` and `dict` below shadow the builtins.
    for list in soup.find_all('img'):  
        # Split out the tag's attributes.
        dict = list.attrs  
        if "data-lazy" in dict:  
            image = dict['data-lazy']  
            # Everything from the last '.' onwards is used as the suffix.
            img = image[image.rfind('.')::]  
            filepath = path + str(count)+img  
            with open(filepath, 'wb') as file:  
                image_data = request.urlopen(dict['data-lazy']).read()  
                print(dict['data-lazy'])  
                file.write(image_data)  
            count += 1  
            # Redundant: the with-block above has already closed the file.
            file.close()  
# Script entry point of the quoted original: print a banner, then run
# taobao() against the Taobao home page.
if __name__ == '__main__':  
    print(""" 
+++++++++++++++++++++++ 
  学校:超神学院 
  专业:德玛班 
  姓名:德玛之力 
  version: python3.2 
+++++++++++++++++=++++ 
     """)  
    url = 'http://www.taobao.com/?spm=a310q.2219005.1581860521.1.b9kUd4'  
    taobao(url)  

 

结果(右边是对应的图片源代码),不足的是链接图片没有下载,只下载位于本页面的图片:

 

 

posted @ 2016-09-20 17:10  CC_python  阅读(87)  评论(0)    收藏  举报