Python 爬虫基本示例

示例一：

import urllib.request
# Placeholder URL; it is not used by the request below.
url = "https://xxxxxx"

# Build the request object first, then send it.
req = urllib.request.Request('http://www.baidu.com')
response = urllib.request.urlopen(req)

# Print the response headers, the HTTP status code and the final URL, in
# that order.
for accessor in (response.info, response.getcode, response.geturl):
    print(accessor())

示例二：

#中英文翻译

import urllib.request
import urllib.parse
import json
import sys

def open_url():
    """Submit the text to translate to the iciba endpoint and return the reply.

    The text is read from the module-level name ``content``, which the
    ``__main__`` loop assigns before calling this function.

    Returns:
        bytes: the raw, undecoded response body.

    Raises:
        urllib.error.URLError: if the request cannot be completed.
    """
    url = "http://fy.iciba.com/ajax.php?a=fy"
    form = {
        'f': 'auto',
        't': 'auto',
        'w': content,
    }
    # Passing a body to Request makes urllib send the request as a POST.
    data = urllib.parse.urlencode(form).encode('UTF-8')

    req = urllib.request.Request(url, data)
    # Headers must be attached BEFORE the request is sent; adding them after
    # urlopen() has no effect on the request that already went out.
    # NOTE(review): the header value ends with a fused
    # "X-Requested-With:XMLHttpRequest" fragment that looks like a paste
    # slip; it is kept unchanged here -- confirm whether a separate header
    # was intended.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2132.2 Safari/537.36X-Requested-With:XMLHttpRequest')

    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req) as response:
        return response.read()


def english(html):
    """Print the translation found in a response for English input.

    Args:
        html: UTF-8 encoded JSON response body (bytes).

    The first entry of ``content.word_mean`` is preferred; when that entry
    is missing or empty, ``content.out`` is printed instead.

    Raises:
        KeyError: if neither field is present in the response.
    """
    target = json.loads(html.decode('utf-8'))

    try:
        result = target['content']['word_mean'][0]
    except (KeyError, IndexError, TypeError):
        # Only a missing/empty "word_mean" triggers the fallback; unrelated
        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        result = target['content']['out']

    print("翻译结果: %s" % result)


def chin(html):
    """Print the translation found in a response for Chinese input.

    Args:
        html: UTF-8 encoded JSON response body (bytes); the value printed
            is its ``content.out`` field.
    """
    payload = json.loads(html.decode('utf-8'))
    translated = payload['content']['out']
    print("翻译结果: %s" % translated)

# Check whether a string contains any Chinese character.
def Chinese(str):
    """Return True if *str* contains at least one Chinese character.

    A character counts as Chinese when its code point lies in the range
    U+4E00..U+9FA5 (inclusive). Every character is inspected individually,
    so Chinese text embedded in other text (e.g. "hello你好") is detected.

    Args:
        str: the text to inspect. (The parameter name shadows the builtin;
            it is kept unchanged for compatibility with existing callers.)

    Returns:
        bool: True when a Chinese character is present, otherwise False.
    """
    return any('\u4e00' <= ch <= '\u9fa5' for ch in str)

if __name__ == "__main__":

    # Interactive loop: read a phrase, translate it, repeat until "q".
    while True:
        # open_url() reads this module-level name, so it must be assigned
        # before the request is made.
        content = input("请输入需要翻译的内容(退出q): ").strip()

        if content == 'q':
            sys.exit()

        # Pick the printer that matches the input language, then fetch the
        # translation and hand it over.
        show_result = chin if Chinese(content) else english
        show_result(open_url())

示例三：

#图片下载例子

import urllib.request
import re

def open_url(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: address of the page to download.

    Returns:
        str: the decoded response body.

    Raises:
        urllib.error.URLError: if the request cannot be completed.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    # NOTE(review): the header value ends with a fused
    # "X-Requested-With:XMLHttpRequest" fragment that looks like a paste
    # slip; it is kept unchanged here -- confirm the intended value.
    req.add_header('User-agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2132.2 Safari/537.36X-Requested-With:XMLHttpRequest')

    # The context manager closes the response as soon as the body is read
    # instead of leaving that to garbage collection.
    with urllib.request.urlopen(req) as page:
        return page.read().decode('utf-8')

def get_img(html):
    """Print and download every matching ``.jpg`` image referenced in *html*.

    Only ``<img class="BDE_Image" src="...jpg"`` tags are considered. All
    image URLs are printed first; each image is then saved under the last
    path segment of its URL (a relative file name).

    Args:
        html: page source as text.

    Raises:
        urllib.error.URLError: if an image cannot be downloaded.
    """
    pattern = r'<img class="BDE_Image" src="([^"]+\.jpg)"'
    image_urls = re.findall(pattern, html)

    for image_url in image_urls:
        print(image_url)

    for image_url in image_urls:
        filename = image_url.split('/')[-1]

        # Download the image. (This call was mis-indented in the original,
        # which made the whole script fail with an IndentationError.)
        urllib.request.urlretrieve(image_url, filename)

if __name__ == '__main__':
    # Download the thread page, then save every matching image it contains.
    url = "http://tieba.baidu.com/p/3563409202"
    page_source = open_url(url)
    get_img(page_source)

示例四：

#匹配IP地址例子

import urllib.request
import re

def open_url(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: address of the page to download.

    Returns:
        str: the decoded response body.

    Raises:
        urllib.error.URLError: if the request cannot be completed.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    # NOTE(review): the header value ends with a fused
    # "X-Requested-With:XMLHttpRequest" fragment that looks like a paste
    # slip; it is kept unchanged here -- confirm the intended value.
    req.add_header('User-agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2132.2 Safari/537.36X-Requested-With:XMLHttpRequest')

    # The context manager closes the response as soon as the body is read
    # instead of leaving that to garbage collection.
    with urllib.request.urlopen(req) as page:
        return page.read().decode('utf-8')

def get_img(html):
    """Print every IPv4 address found in *html*, one per line.

    Despite its name (kept for compatibility with the caller), this
    function extracts dotted-quad IP addresses, not images.

    Args:
        html: page source as text.
    """
    # The three-digit alternatives (25x, 2xx) are tried first so that the
    # final octet is matched in full; with the shorter alternative first,
    # "192.168.1.200" was reported as "192.168.1.20". The digit look-arounds
    # keep the match from starting or stopping in the middle of a longer
    # run of digits (e.g. "1.1.1.256" is rejected rather than truncated).
    octet = r'(?:25[0-5]|2[0-4]\d|[0-1]?\d?\d)'
    pattern = r'(?<!\d)(?:' + octet + r'\.){3}' + octet + r'(?!\d)'

    for address in re.findall(pattern, html):
        print(address)



if __name__ == '__main__':
    # Download the page, then print every IP address it contains.
    url = "http://www.xicidaili.com"
    page_source = open_url(url)
    get_img(page_source)

posted @ 2017-07-07 16:22 cdxing 阅读(266) 评论(0) 收藏举报

刷新页面返回顶部

cdxing

python 爬虫基本示例

公告