Python之爬取网页的一个例子

import time,random
import urllib2,urllib,socket,re
from bs4 import BeautifulSoup
import cx_Oracle



# Open the Oracle connection shared by the whole script.
# NOTE(review): 'xxx/xxx' looks like a placeholder connect string -- confirm
# the real credentials are supplied before running.
conn = cx_Oracle.connect('xxx/xxx')

# Create the cursor outside the try block: if this fails the script should
# stop here rather than leave `cursor` undefined for crawl() to trip over.
cursor = conn.cursor()
try:
    cursor.execute('create table tb_user(id varchar2(50), name varchar2(50),password varchar(50))')
except cx_Oracle.Error:
    # Best effort: table creation fails with a database error when it cannot
    # be created (presumably because tb_user already exists from an earlier
    # run). Only database errors are tolerated; anything else propagates.
    print("wwwwwwww")

# Module-level values not referenced by the code visible here; kept in case
# other code reads them.
x = 0
my_dh = 0

# Browser-like headers sent with every request (listing and detail pages).
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}
# Seconds to wait on a request before giving up.
_REQUEST_TIMEOUT = 60


def _fetch(url):
    """Download *url* with the shared headers/timeout and return the raw body."""
    req = urllib2.Request(url, headers=_REQUEST_HEADERS)
    response = urllib2.urlopen(req, timeout=_REQUEST_TIMEOUT)
    try:
        return response.read()
    finally:
        # Python 2 urllib2 responses are not context managers; close explicitly.
        response.close()


def _first_text(soup, selector, default='null'):
    """Return the text of the first element matching *selector*, or *default*."""
    matches = soup.select(selector)
    return matches[0].text if matches else default


def crawl(url):
    """Scrape one listing page and store every listing that names a contact.

    Fetches *url*, follows each link matched by ``.des h2 a`` to its detail
    page, and extracts the contact name (``.c_000``), phone number
    (``.phone-num``), title (``.c_333.f20``) and price (``.c_ff552e``).
    Detail pages without a ``.c_000`` element are skipped. For every kept
    listing the function prints the assembled line, inserts
    (detail URL, contact, phone) into ``tb_user`` through the module-level
    ``cursor``, commits on ``conn``, prints the bound parameters, and appends
    the line (UTF-8 encoded) to ``E:\\Python\\text.txt``.

    A missing phone, title or price is recorded as the string ``'null'``.
    Sleeps a random 0-5 seconds before each detail request.

    Args:
        url: Address of the listing page to scrape.
    """
    listing = BeautifulSoup(_fetch(url), 'html.parser')
    links = listing.select(".des h2 a")

    # The with-block guarantees buffered rows are flushed and the handle is
    # released when the page is done, even if a request or insert raises.
    with open(r'E:\Python\text.txt', 'a') as out:
        for link in links:
            # Random pause between detail requests.
            time.sleep(random.random() * 5)
            detail_url = link['href']
            detail = BeautifulSoup(_fetch(detail_url), 'html.parser')

            contacts = detail.select('.c_000')
            if not contacts:
                # No contact element on the detail page: nothing to record.
                continue

            contact = contacts[0].text
            phone = _first_text(detail, '.phone-num')
            # Title and price fall back to 'null' like the phone number, so a
            # single incomplete listing cannot abort the whole crawl.
            title = _first_text(detail, '.c_333.f20')
            price = _first_text(detail, '.c_ff552e')
            message = detail_url + '  ' + contact + '  ' + phone + '  ' + title + price

            print(message)
            param = {'id': detail_url, 'n': contact, 'p': phone}
            cursor.execute('insert into tb_user values(:id,:n,:p)', param)
            conn.commit()
            print(param)
            out.write(message.encode('utf-8') + '\n')


# Crawl listing pages 1 through 100. The loop variable is used as-is:
# incrementing it inside the body shifted every page number by one, so
# page 1 was silently skipped.
for page in range(1, 101):
    url = 'http://cc.58.com/chuzu/pn{}'.format(page)
    crawl(url)

该脚本能够将部分网页数据提取出来，写入 TXT 文档并导入 Oracle 数据库。导入数据库时，每提取一条信息便立即提交到 Oracle 数据库；而写入文档时，由于文件写入带有缓冲，要累积一定数量的数据才会真正写入一次。

其中还有一些小毛病需要改善。

posted on 2017-06-14 16:57 勤学才是王道 阅读(556) 评论(0) 编辑 收藏 举报

会员力量，点亮园子希望

刷新页面返回顶部

勤学才是王道

Python之爬取网页的一个例子

导航

公告