笔记，爬虫（5）

近期学习，虽不知其所以然，也还是笨头笨脑地捣鼓了一点东西，稍有点感觉。

参考：http://blog.csdn.net/abclixu123/article/details/39754619

import urllib.request
import datetime
import re
from bs4 import BeautifulSoup

# Demo: the argument to strip()/rstrip() is a set of characters to remove from
# the ends of the string, not a literal prefix or suffix.
#   s.strip(rm)   removes leading and trailing characters that appear in rm
#   s.lstrip(rm)  removes leading characters that appear in rm
#   s.rstrip(rm)  removes trailing characters that appear in rm
# With no argument, whitespace ('\n', '\r', '\t', ' ') is removed instead.
a = '123abcdddd<dir>'
for variant in (a, a.strip('241'), a.rstrip('<dir>')):
    print(variant)
# The author's original note is kept below as a no-op string literal.
'''
声明：s为字符串，rm为要删除的字符序列
s.strip(rm)        删除s字符串中开头、结尾处，位于 rm删除序列的字符
s.lstrip(rm)       删除s字符串中开头处，位于 rm删除序列的字符
s.rstrip(rm)      删除s字符串中结尾处，位于 rm删除序列的字符
注意： 当rm为空时，默认删除空白符（包括'\n', '\r',  '\t',  ' ')
'''

# Time how long it takes to download and parse the listing page.
starttime = datetime.datetime.now()

url = "https://www.packtpub.com/all"
# The context manager closes the HTTP response as soon as BeautifulSoup has
# consumed it, even if parsing raises.
with urllib.request.urlopen(url) as page:
    soup_packtpage = BeautifulSoup(page, "html.parser")

endtime = datetime.datetime.now()
print(endtime - starttime)  # elapsed time for opening and parsing the page

# Time the extraction phase separately from the download.
starttime = datetime.datetime.now()

all_book_title = soup_packtpage.find_all("div", class_="book-block-title")

# Print one sample title; skip it instead of raising IndexError when the page
# yields fewer than six titles.
if len(all_book_title) > 5:
    print(all_book_title[5].text)

# An earlier pattern for "whitespace, £, whitespace, digits.digits" was compiled
# here and immediately overwritten; that dead assignment (a non-raw string with
# invalid escape sequences) has been removed.  '£' is the pound sterling sign,
# not the dollar sign.
# NOTE(review): a compiled pattern passed positionally to find_next() is
# matched against tag names, and ".*?$" matches every name, so the lookup below
# presumably returns the first tag after the title in document order rather
# than searching for price text.  Confirm against the live markup; an explicit
# find_next("div", class_="book-block-price") may be more reliable.
price_regexp = re.compile(".*?$")
for book_title in all_book_title:
    # .text is used rather than .string, which raised
    # "'NoneType' object has no attribute 'strip'" on this page.
    print("Book's name is " + book_title.text.strip())
    book_price = book_title.find_next(price_regexp)
    if book_price is None:
        # Nothing follows the title; report it instead of crashing on None.
        print("Book's price is ", "(not found)")
    else:
        # strip() also drops the surrounding blank lines, keeping output compact.
        print("Book's price is ", book_price.text.strip())
    print("\n")

endtime = datetime.datetime.now()
print(endtime - starttime)

这里给出参考源代码（如下），我一直没跑通，究其原因，部分可能是HTML的代码与以前相比发生变化，导致提供的正则不再适用：

import urllib.request  
import datetime  
import re  
  
from bs4 import BeautifulSoup  
  
# Reference implementation quoted from the linked article, kept as published.
# It times the page download/parse, then the title/price extraction.
starttime = datetime.datetime.now()  
  
url = "https://www.packtpub.com/all"  
page = urllib.request.urlopen(url)  
# No parser is named here, so BeautifulSoup picks one itself.
soup_packtpage = BeautifulSoup(page)  
page.close()  
  
endtime = datetime.datetime.now()  
# Elapsed time for opening and parsing the page.
print (endtime - starttime)  
  
starttime = datetime.datetime.now()  
  
# Every <div class="book-block-title"> is treated as one book title.
all_book_title = soup_packtpage.find_all("div", class_="book-block-title")  
  
  
# Whitespace, a pound sign, one whitespace character, then a decimal number.
# NOTE(review): the pattern is not a raw string, so "\s", "\d" and "\." rely on
# Python passing unknown escapes through unchanged -- confirm before reuse.
price_regexp = re.compile(u"\s+£\s\d+\.\d+")  
  
for book_title in all_book_title:  
    # NOTE(review): .string is presumably None when the div holds more than one
    # child node, which would make .strip() raise AttributeError -- verify
    # against the current page markup.
    print("Book's name is " + book_title.string.strip())  
    # Looks for the next text after the title that matches the price pattern.
    book_price = book_title.find_next(text=price_regexp)  
    print("Book's price is "+ book_price.strip())  
    print("\n")  
      
# The second end time is recorded but never printed in this snippet.
endtime = datetime.datetime.now()

结果（只保留部分截图）：

对应爬的网页源代码：

参考：http://blog.csdn.net/ben_ben_niao/article/details/40677869

import re
import urllib.request
import urllib
import os   # os.path模块详见:http://www.cnblogs.com/dkblog/archive/2011/03/25/1995537.html

def getHtml(url, encoding='UTF-8'):
    """Fetch *url* and return the response body decoded as text.

    Args:
        url: Address to open; anything ``urllib.request.urlopen`` accepts.
        encoding: Codec used to decode the raw bytes.  Defaults to UTF-8,
            which is what this function has always used.

    Returns:
        The decoded page source as a ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    # The context manager closes the response even when read() fails.
    with urllib.request.urlopen(url) as page:
        return page.read().decode(encoding)

def getImg(html, path='f:\\test'):
    """Download every ``.jpg`` referenced as ``src="..." pic_ext`` in *html*.

    Args:
        html: Page source to scan.
        path: Directory the images are saved into; created if missing.
            Defaults to the original hard-coded ``f:\\test``.

    Returns:
        None.  Images are written to *path* as ``0.jpg``, ``1.jpg``, ... in
        the order they appear in *html*.

    Raises:
        urllib.error.URLError: If an image URL cannot be retrieved.
        OSError: If *path* cannot be created or written to.
    """
    # The parentheses form a capture group, so findall() returns only the URL.
    # (For Taobao PNGs the author used r'data-lazy="(.+?\.png)" ' instead.)
    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    imglist = imgre.findall(html)

    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(path, exist_ok=True)

    for x, imgurl in enumerate(imglist):
        # os.path.join uses the platform separator, so files land inside
        # *path* on every OS; urlretrieve() downloads straight to that file.
        urllib.request.urlretrieve(imgurl, os.path.join(path, '{}.jpg'.format(x)))

if __name__ == '__main__':
    # Scrape only when run as a script, so importing this module does not
    # touch the network.  For Taobao the author used
    # getHtml(r"http://www.taobao.com/") instead.
    html = getHtml("http://tieba.baidu.com/p/2460150866")
    getImg(html)

结果（右边是对应的图片源代码）：

参考：http://blog.csdn.net/omuyejingfeng1/article/details/24261203/

import urllib
import urllib.request as request
from bs4 import BeautifulSoup

def taobao(url, path='f:/image/'):
    """Download every ``<img>`` on *url* that carries a ``data-src`` attribute.

    Args:
        url: Page to scrape; its body is decoded as UTF-8.
        path: Directory the images are saved into; created if missing.
            Defaults to the original hard-coded ``f:/image/``.

    Returns:
        None.  Images are saved as ``1<ext>``, ``2<ext>``, ... where ``<ext>``
        is the extension of the image URL's path component.

    Raises:
        urllib.error.URLError: If the page or an image cannot be retrieved.
        OSError: If *path* cannot be created or written to.
    """
    import os
    from urllib.parse import urlsplit

    # Close the response as soon as the body has been read.
    with request.urlopen(url) as response:
        data = response.read().decode('utf-8')
    soup = BeautifulSoup(data, "html.parser")

    # urlretrieve() does not create missing directories.
    os.makedirs(path, exist_ok=True)

    count = 1
    for img_tag in soup.find_all('img'):
        attrs = img_tag.attrs
        if "data-src" not in attrs:
            continue
        image = attrs['data-src']
        # Some URLs are complete and some lack the scheme; urlretrieve() needs
        # the full address.  Test the prefix, not "http" anywhere in the URL.
        if not image.startswith(('http://', 'https://')):
            image = 'http:' + image
            print(image)
        # Take the extension (.png, .jpg, ...) from the URL path only, so a dot
        # in the host name or query string cannot leak into the file name.
        extension = os.path.splitext(urlsplit(image).path)[1]
        filepath = os.path.join(path, str(count) + extension)
        request.urlretrieve(image, filepath)
        count += 1

if __name__ == '__main__':
    # Script entry point: run the image scraper against the Taobao home page.
    taobao('http://www.taobao.com/?spm=a310q.2219005.1581860521.1.b9kUd4')

原来的程序（如下）没能跑通，而且有一些（可能是无意的）错误，与上面修改后的版本对比即可发现：

import urllib  
import urllib.request as request  
from bs4 import BeautifulSoup  
def taobao(url):  
    """Save every <img> on *url* that has a ``data-lazy`` attribute.

    Original version quoted from the referenced article, kept as published.
    Files are written to ``f:/image/`` as ``<count><extension>``, counting
    from 1.
    """
    response = request.urlopen(url)  
    html = response.read()  
    # Original author's note: on Windows 7 the default encoding is GBK, so the
    # bytes are decoded first and then re-encoded as UTF-8 to display Chinese.
    data = html.decode('gbk').encode('utf-8')  
    # No parser is named here, so BeautifulSoup picks one itself.
    soup = BeautifulSoup(data)  
    path = 'f:/image/'  
    count = 1  
    # NOTE: the names `list` and `dict` below shadow the builtins.
    for list in soup.find_all('img'):  
        # Split out the tag's attributes.
        dict = list.attrs  
        if "data-lazy" in dict:  
            image = dict['data-lazy']  
            # Everything from the last '.' onward is used as the file extension.
            img = image[image.rfind('.')::]  
            filepath = path + str(count)+img  
            with open(filepath, 'wb') as file:  
                # The attribute value is passed to urlopen() unchanged.
                image_data = request.urlopen(dict['data-lazy']).read()  
                print(dict['data-lazy'])  
                file.write(image_data)  
            count += 1  
            # The with block has already closed the file; this call is redundant.
            file.close()  
# Entry point of the quoted original: print the author's banner, then scrape
# the Taobao home page.
if __name__ == '__main__':  
    print(""" 
+++++++++++++++++++++++ 
  学校：超神学院 
  专业：德玛班 
  姓名：德玛之力 
  version: python3.2 
+++++++++++++++++=++++ 
     """)  
    url = 'http://www.taobao.com/?spm=a310q.2219005.1581860521.1.b9kUd4'  
    taobao(url)

结果（右边是对应的图片源代码）。不足之处：只下载了位于本页面的图片，链接所指向的图片没有下载：

posted @ 2016-09-20 17:10 CC_python 阅读(87) 评论(0) 收藏举报

刷新页面返回顶部

CC_python

笔记，爬虫（5）

公告