python : 3.5

jdk : 1.7

eclipse : 4.5.2(有点低了,需要对应Neon 4.6,不然总是会弹出提示框)

 

 

 

 

应该学习最新版本的 Python 3 还是旧版本的 Python 2.7?

 

MySqlDB官网只支持Python3.4,这里Python3.5使用第三方库PyMysql连接Mysql数据库。

http://dev.mysql.com/downloads/connector/python/2.0.html

image

PyMysql下载地址:

https://pypi.python.org/pypi/PyMySQL#downloads

Windows下安装方法:

下载解压后,进入PyMySql-0.6.7目录,执行python setup.py install安装

image

 

 

test1.py

 1 import urllib.request as request
 2 def baidu_tieba(url, begin_page, end_page):
 3     for i in range(begin_page, end_page + 1):
 4         sName = 'D:/360Downloads/test/'+str(i).zfill(5)+'.html'
 5         print('正在下载第'+str(i)+'个页面, 并保存为'+sName)
 6         m = request.urlopen(url+str(i)).read()
 7         with open(sName,'wb') as file:
 8             file.write(m)
 9         file.close()
10 if __name__ == "__main__":
11     url = "http://tieba.baidu.com/p/"
12     begin_page = 1
13     end_page = 3
14     baidu_tieba(url, begin_page, end_page)

 

test2.py

 1 import urllib.request as request
 2 import re
 3 import os
 4 import urllib.error as error
 5 def baidu_tieba(url, begin_page, end_page):
 6     count = 1
 7     for i in range(begin_page, end_page + 1):
 8         sName = 'D:/360Downloads/test/' + str(i).zfill(5) + '.html'
 9         print('正在下载第' + str(i) + '个页面, 并保存为' + sName)
10         m = request.urlopen(url + str(i)).read()
11         # 创建目录保存每个网页上的图片
12         dirpath = 'D:/360Downloads/test/'
13         dirname = str(i)
14         new_path = os.path.join(dirpath, dirname)
15         if not os.path.isdir(new_path):
16             os.makedirs(new_path)
17         page_data = m.decode('gbk', 'ignore')   
18         page_image = re.compile('<img src=\"(.+?)\"')
19         for image in page_image.findall(page_data):
20             pattern = re.compile(r'^http://.*.png$')
21             if  pattern.match(image):
22                 try:
23                     image_data = request.urlopen(image).read()
24                     image_path = dirpath + dirname + '/' + str(count) + '.png'
25                     count += 1
26                     print(image_path)
27                     with open(image_path, 'wb') as image_file:
28                         image_file.write(image_data)
29                     image_file.close()
30                 except error.URLError as e:
31                     print('Download failed')
32         with open(sName, 'wb') as file:
33             file.write(m)
34         file.close()
35 if __name__ == "__main__":
36     url = "http://tieba.baidu.com/p/"
37     begin_page = 1
38     end_page = 3
39     baidu_tieba(url, begin_page, end_page)

 

 

test3.py

 1 #python3.4 爬虫教程
 2 #爬取网站上的图片
 3 #林炳文Evankaka(博客:http://blog.csdn.net/evankaka/)
 4 import urllib.request  
 5 import socket  
 6 import re  
 7 import sys  
 8 import os  
 9 targetDir = r"D:\PythonWorkPlace\load"  #文件保存路径
10 def destFile(path):  
11     if not os.path.isdir(targetDir):  
12         os.makedirs(targetDir)  
13     pos = path.rindex('/')  
14     t = os.path.join(targetDir, path[pos+1:])  
15     print(t)
16     return t  
17 if __name__ == "__main__":  #程序运行入口
18     weburl = "http://www.douban.com/"
19     webheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'} 
20     req = urllib.request.Request(url=weburl, headers=webheaders)  #构造请求报头
21     webpage = urllib.request.urlopen(req)  #发送请求报头
22     contentBytes = webpage.read()  
23     for link, t in set(re.findall(r'(https:[^\s]*?(jpg|png|gif))', str(contentBytes))):  #正则表达式查找所有的图片
24         print(link)
25         try: 
26             urllib.request.urlretrieve(link, destFile(link)) #下载图片
27         except:
28             print('失败') #异常抛出

 

 

test4.py

 1 '''
 2 第一个示例:简单的网页爬虫
 3  
 4 爬取豆瓣首页
 5 '''
 6  
 7 import urllib.request
 8  
 9 #网址
10 url = "http://bj.58.com/caishui/28707491160259x.shtml?adtype=1&entinfo=28707491160259_0&adact=3&psid=156713756196890928513274724"
11  
12 #请求
13 request = urllib.request.Request(url)
14  
15 #爬取结果
16 response = urllib.request.urlopen(request)
17  
18 data = response.read()
19  
20 #设置解码方式
21 data = data.decode('utf-8')
22  
23 #打印结果
24 print(data)
25  
26 #打印爬取网页的各类信息
27  
28 # print(type(response))
29 # print(response.geturl())
30 # print(response.info())
31 # print(response.getcode())

 

test5.py

 1 #!/usr/bin/env python
 2 #-*-coding: utf-8 -*-
 3 import re
 4 import urllib.request as request
 5 from bs4 import BeautifulSoup as bs
 6 import csv
 7 import os
 8 import sys
 9 from imp import reload 
10 reload(sys)
11  
12 def GetAllLink():
13     num = int(input("爬取多少页:>"))
14     if not os.path.exists('./data/'):
15         os.mkdir('./data/')
16      
17     for i in range(num):
18         if i+1 == 1:
19             url = 'http://nj.58.com/piao/'
20             GetPage(url, i)
21         else:
22             url = 'http://nj.58.com/piao/pn%s/' %(i+1)
23             GetPage(url, i)
24  
25  
26 def GetPage(url, num):
27     Url = url
28     user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
29     headers = { 'User-Agent' : user_agent }
30     req = request.Request(Url, headers = headers)
31     page = request.urlopen(req).read().decode('utf-8')
32     soup = bs(page, "html.parser")
33     table = soup.table
34     tag = table.find_all('tr')
35     # 提取出所需的那段
36     soup2 = bs(str(tag), "html.parser")
37     title = soup2.find_all('a','t')         #标题与url 
38     price = soup2.find_all('b', 'pri')      #价格
39     fixedprice = soup2.find_all('del')      #原价
40     date = soup2.find_all('span','pr25')    #时间 
41  
42     atitle = []
43     ahref = []
44     aprice = []
45     afixedprice = []
46     adate = []
47  
48     for i in title:
49         #print i.get_text(), i.get('href')
50         atitle.append(i.get_text())
51         ahref.append(i.get('href'))
52     for i in price:
53         #print i.get_text()
54         aprice.append(i.get_text())
55     for i in fixedprice:
56         #print j.get_text()
57         afixedprice.append(i.get_text())
58     for i in date:
59         #print i.get_text()
60         adate.append(i.get_text())
61 
62     csvfile = open('./data/ticket_%s.csv'%num, 'w')
63     writer = csv.writer(csvfile)
64     writer.writerow(['标题','url','售价','原价','演出时间'])
65     '''
66     每个字段必有title,但是不一定有时间date
67     如果没有date日期,我们就设为'---'
68     '''
69     if len(atitle) > len(adate):
70         for i in range(len(atitle) - len(adate)):
71             adate.append('---')
72         for i in range(len(atitle) - len(afixedprice)):
73             afixedprice.append('---')
74         for i in range(len(atitle) - len(aprice)):
75             aprice.append('---')
76             
77     for i in range(len(atitle)):
78             message = atitle[i]+'|'+ahref[i]+'|'+aprice[i]+ '|'+afixedprice[i]+'|'+ adate[i]
79             writer.writerow([i for i in str(message).split('|')])
80     print ("[Result]:> 页面 %s 信息保存完毕!"%(num+1))
81     csvfile.close()
82  
83  
84 if __name__ == '__main__':
85     GetAllLink()

 

test6.py

 1 #!/usr/bin/env python
 2 #-*-coding: utf-8 -*-
 3 import urllib.request as request
 4 from bs4 import BeautifulSoup as bs
 5 import sys
 6 from imp import reload 
 7 reload(sys)
 8  
 9 def GetAllLink():
10     num = int(input("爬取多少页:>"))
11      
12     for i in range(num):
13         if i+1 == 1:
14             url = 'http://bj.58.com/caishui/?key=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&cmcskey=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&final=1&jump=1&specialtype=gls'
15             GetPage(url, i)
16         else:
17             url = 'http://bj.58.com/caishui/pn%s/'%(i+1)+'?key=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&cmcskey=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&final=1&specialtype=gls&PGTID=0d30215f-0000-1941-5161-367b7a641048&ClickID=4' 
18             GetPage(url, i)
19  
20  
21 def GetPage(url, num):
22     Url = url
23     user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
24     headers = { 'User-Agent' : user_agent }
25     req = request.Request(Url, headers = headers)
26     page = request.urlopen(req).read().decode('utf-8')
27     soup = bs(page, "html.parser")
28     table = soup.table
29     tag = table.find_all('tr')
30     
31     # 提取出所需的那段
32     soup2 = bs(str(tag), "html.parser")
33     
34     title = soup2.find_all('a','t')         #标题与url 
35     companyName = soup2.find_all('a','sellername') #公司名称
36  
37     atitle = []
38     ahref = []
39     acompanyName = []
40  
41     for i in title:
42         atitle.append(i.get_text())
43         ahref.append(i.get('href'))
44     for i in companyName:
45         acompanyName.append(i.get_text())
46     for i in range(len(ahref)):
47         getSonPage(str(ahref[i]))
48         
49             
50 def getSonPage(url): 
51     Url = url
52     user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
53     headers = { 'User-Agent' : user_agent }
54     req = request.Request(Url, headers = headers)
55     page = request.urlopen(req).read().decode('utf-8')
56     soup = bs(page, "html.parser")
57     print("=========================")
58     #类别
59     print(soup.find('div','su_con').get_text())
60     #服务区域
61     print(soup.find('div','su_con quyuline').get_text())
62     #联&nbsp;系&nbsp;人
63     print(soup.find_all('ul','suUl')[0].find_all('li')[2].find_all('a')[0].get_text())
64     #商家地址
65     print(soup.find_all('ul','suUl')[0].find_all('li')[3].find('div','su_con').get_text().replace("\n",'').replace("\r",'').replace('\t','').replace('&nbsp;',''))
66     #服务项目
67     print(soup.find('article','description_con').get_text().replace("_____________________________________","\n\r").replace("___________________________________","\n\r").replace("(以下为公司北京区域分布图)",""))
68     print("=========================")
69  
70 if __name__ == '__main__':
71     GetAllLink()

 

test7.py

1 import pymysql
2 conn = pymysql.connect(host='192.168.1.102', port=3306,user='root',passwd='123456',db='test',charset='UTF8')
3 cur = conn.cursor()
4 cur.execute("select version()")
5 for i in cur:
6     print(i)
7 cur.close()
8 conn.close()

 

posted on 2017-08-08 15:26  风又奈何  阅读(916)  评论(0)    收藏  举报