cdxing In solitude, where we are least alone

Python 爬虫基本示例

示例一:

import urllib.request
# Placeholder URL; it is not referenced by any of the statements below.
url = "https://xxxxxx"

# Build a request object for http://www.baidu.com and open it.
req = urllib.request.Request('http://www.baidu.com')
response = urllib.request.urlopen(req)


# Headers of the response.
print(response.info())
# HTTP status code of the response.
print(response.getcode())
# URL of the resource that was actually retrieved.
print(response.geturl())

示例二:

#中英文翻译

import urllib.request
import urllib.parse
import json
import sys


def open_url():
    """POST the text to translate to the iciba endpoint and return the reply.

    The text is read from the module-level name ``content`` (this function
    takes no arguments). It is form-encoded as field ``w`` together with
    ``f`` and ``t`` set to ``'auto'`` and sent as the request body.

    Returns:
        bytes: the raw, undecoded response body.

    Raises:
        NameError: if the global ``content`` has not been assigned yet.
        urllib.error.URLError: if the request cannot be completed.
    """
    url = "http://fy.iciba.com/ajax.php?a=fy"
    form = {'f': 'auto', 't': 'auto', 'w': content}
    payload = urllib.parse.urlencode(form).encode('UTF-8')

    req = urllib.request.Request(url, payload)
    # Headers have to be attached BEFORE the request is opened; the original
    # code added this one after urlopen(), so it was never transmitted.
    # NOTE(review): the value ends with "X-Requested-With:XMLHttpRequest",
    # which looks like a second header pasted into the User-Agent string.
    # It is kept byte-for-byte here; confirm whether it should be split out.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2132.2 Safari/537.36X-Requested-With:XMLHttpRequest')

    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(req) as response:
        return response.read()

    
def english(html):
    """Print the translation carried by a response for an English query.

    Args:
        html: UTF-8 encoded JSON bytes whose top-level object has a
            ``content`` mapping.

    The first entry of ``content['word_mean']`` is printed when it is
    available; otherwise ``content['out']`` is printed instead.

    Raises:
        KeyError: if ``content`` is missing, or if neither ``word_mean``
            nor ``out`` is usable.
    """
    target = json.loads(html.decode('utf-8'))
    body = target['content']
    try:
        result = body['word_mean'][0]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures trigger the fallback; a bare ``except`` would
        # also have swallowed KeyboardInterrupt and SystemExit.
        result = body['out']
    print("翻译结果: %s" % result)
    


def chin(html):
    """Print the translation carried by a response for a Chinese query.

    Args:
        html: UTF-8 encoded JSON bytes whose top-level object has a
            ``content`` mapping containing an ``out`` entry.
    """
    payload = json.loads(html.decode('utf-8'))
    translated = payload['content']['out']
    print("翻译结果: %s" % translated)

# Check whether a piece of text contains Chinese characters.
def Chinese(str):
    """Return True if ``str`` contains a character in U+4E00..U+9FA5.

    Each character is tested individually. The earlier implementation split
    the text on whitespace and compared whole words lexicographically, so a
    word such as ``"hello你好"`` (Chinese not in first position) was reported
    as non-Chinese.

    Args:
        str: the text to inspect. The name shadows the builtin but is kept
            so that existing keyword callers continue to work.

    Returns:
        bool: True when at least one character is in the range, else False
        (including for the empty string).
    """
    return any('\u4e00' <= ch <= '\u9fa5' for ch in str)

if __name__ == "__main__":
    # Interactive loop: read a phrase, send it for translation and print the
    # result, until the user enters "q".
    while True:
        content = input("请输入需要翻译的内容(退出q): ").strip()

        if content == 'q':
            sys.exit()

        # open_url() reads the module-level ``content`` assigned above; the
        # printer is chosen by whether the input contains Chinese characters.
        (chin if Chinese(content) else english)(open_url())

示例三:

#图片下载例子

import urllib.request
import re

def open_url(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    Args:
        url: address of the page to download.

    Returns:
        str: the decoded page content.

    Raises:
        urllib.error.URLError: if the request cannot be completed.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    # NOTE(review): the value ends with "X-Requested-With:XMLHttpRequest",
    # which looks like a second header pasted into the string; kept as-is.
    req.add_header('User-agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2132.2 Safari/537.36X-Requested-With:XMLHttpRequest')

    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(req) as page:
        return page.read().decode('utf-8')


def get_img(html):
    """Print and download every ``BDE_Image`` JPEG referenced in ``html``.

    Matches ``<img class="BDE_Image" src="....jpg"`` tags, prints each
    matched URL, then saves each image into the current working directory
    under the last path segment of its URL.

    Args:
        html: page markup as text.

    Raises:
        urllib.error.URLError: if an image cannot be downloaded.
    """
    pattern = r'<img class="BDE_Image" src="([^"]+\.jpg)"'
    imglist = re.findall(pattern, html)

    # List everything that was found before starting any download.
    for each in imglist:
        print(each)

    for each in imglist:
        filename = each.split('/')[-1]
        # Download the image. This call was mis-indented by one column in the
        # original, which made the whole example fail with IndentationError.
        urllib.request.urlretrieve(each, filename, None)
        
if __name__ == '__main__':
    # Fetch the Tieba thread page and pass its markup to get_img().
    url = "http://tieba.baidu.com/p/3563409202"
    get_img(open_url(url))

 

示例四:

#匹配IP地址例子

import urllib.request
import re

def open_url(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    Args:
        url: address of the page to download.

    Returns:
        str: the decoded page content.

    Raises:
        urllib.error.URLError: if the request cannot be completed.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    # NOTE(review): the value ends with "X-Requested-With:XMLHttpRequest",
    # which looks like a second header pasted into the string; kept as-is.
    req.add_header('User-agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2132.2 Safari/537.36X-Requested-With:XMLHttpRequest')

    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(req) as page:
        return page.read().decode('utf-8')


def get_img(html):
    """Print every dotted-quad IPv4 address found in ``html``, one per line.

    Despite its name, this function has nothing to do with images; the name
    is kept so existing callers continue to work.

    Args:
        html: text to scan.
    """
    # The longest octet alternatives come first. With the short alternative
    # first, the final octet of e.g. "192.168.1.255" matched only "25" and
    # the address was truncated. The digit look-arounds stop a match from
    # starting or ending in the middle of a longer run of digits.
    octet = r'(?:25[0-5]|2[0-4]\d|[01]?\d?\d)'
    pattern = r'(?<!\d)(?:' + octet + r'\.){3}' + octet + r'(?!\d)'

    for each in re.findall(pattern, html):
        print(each)
        
        
        
if __name__ == '__main__':
    # Fetch the page and pass its markup to get_img(), which prints the
    # IP addresses it matches.
    url = "http://www.xicidaili.com"
    get_img(open_url(url))

 

posted @ 2017-07-07 16:22  cdxing  阅读(261)  评论(0)    收藏  举报