python练习---小脚本

一.爬子域名

#!/usr/bin/python 
# -*- coding: utf-8 -*-

import requests
import re
import sys
def get(domain):
    """Print every sub-domain that i.links.cn reports for ``domain``.

    The lookup form is submitted with a POST request whose fields are sent in
    the query string.  Every ``value="..."`` attribute in the response that is
    immediately followed by an ``<input`` tag is printed on its own line.

    Args:
        domain: Domain name to look up, e.g. ``"example.com"``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = 'http://i.links.cn/subdomain/'
    # Passing key/value pairs (instead of a hand-built string) lets requests
    # URL-encode the domain, so characters such as '&' or '#' can no longer
    # corrupt the query string.  A list keeps the original field order.
    payload = [('domain', domain), ('b2', '1'), ('b3', '1'), ('b4', '1')]
    # Without a timeout an unresponsive server would hang the script forever.
    r = requests.post(url=url, params=payload, timeout=10)
    # Non-greedy match of whatever sits between the quotes.
    pattern = re.compile('value="(.+?)"><input')
    for name in pattern.findall(r.text):
        # Function-call form prints identically on Python 2 and Python 3.
        print(name)

if __name__ == '__main__':
    command = sys.argv[1:]          # every argument after the script name
    if not command:
        # Without a domain the lookup would be sent with an empty "domain"
        # field, so stop with a usage hint instead.
        sys.exit('usage: %s <domain>' % sys.argv[0])
    f = "".join(command)            # concatenated with no separator
    get(f)

二.爬I春秋精华页标题

#!/usr/bin/python 
#coding=GBK


import requests
import re
def gethtml():
    """Fetch the ichunqiu forum portal page and print the matched titles.

    The page is requested with browser-like headers and every piece of text
    captured by the title regular expression is printed on its own line.

    Returns:
        None.  The titles are only printed.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = 'https://bbs.ichunqiu.com/portal.php'
    headers = {
        'Host': 'bbs.ichunqiu.com',
        'Connection': 'close',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    # Without a timeout an unresponsive server would hang the script forever.
    r = requests.get(url=url, headers=headers, timeout=10)
    # When the server does not declare a charset, requests falls back to
    # ISO-8859-1 for text responses, which would garble Chinese titles; use
    # the encoding detected from the body instead.
    if 'charset' not in r.headers.get('Content-Type', '').lower():
        r.encoding = r.apparent_encoding
    # Work on decoded text: matching a str pattern against the raw bytes of
    # r.content raises TypeError on Python 3 and prints undecoded bytes on
    # Python 2.
    html = r.text
    titles = re.findall(r'target="blank" class="ui_colorG" style="color: #555555;">(.*?)</a></h3>', html)
    for title in titles:
        print(title)
# Runs the scrape as soon as this script is executed.  gethtml() has no
# return statement, so ``s`` is always None.
s =gethtml()

# a =re.findall(r'target="blank" class="ui_colorG" style="color: #555555;">(.*?)</a></h3>',s)
# for i in a:
#     print(i)

三.爬妹子图片

#!/usr/bin/python 
# -*- coding: utf-8 -*-

import requests,re,sys
import urllib

def getimg(save_dir='d:/meinv', first_page=1, last_page=297):
    """Download every image referenced by the 7160.com gallery list pages.

    Each list page is fetched, every ``<img src="..." alt="`` address found in
    it is downloaded, and the files are stored as ``0.jpg``, ``1.jpg``, ... in
    ``save_dir``.  A page or image that cannot be fetched is reported and
    skipped so that one failure does not abort the whole run.

    Args:
        save_dir: Directory the images are written to; created if missing.
        first_page: Number of the first list page to scan.
        last_page: Number of the last list page to scan (inclusive).
    """
    import os
    try:
        from urllib.request import urlretrieve   # Python 3
    except ImportError:
        from urllib import urlretrieve           # Python 2

    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    img_pattern = re.compile(r'<img src="(.+?)" alt="')
    # One counter for the whole run: restarting it on every page would make
    # each page overwrite the files saved from the previous one.
    count = 0
    for page in range(first_page, last_page + 1):
        page_url = 'http://www.7160.com/xingganmeinv/list_3_' + str(page) + '.html'
        try:
            r = requests.get(url=page_url, timeout=10)
        except requests.RequestException as e:
            print(e)
            continue
        # The list of matches is iterated as-is.  Appending to it while
        # looping (as the earlier version did) makes the loop never end.
        for img_url in img_pattern.findall(r.text):
            target = os.path.join(save_dir, '%s.jpg' % count)
            try:
                urlretrieve(img_url, target)
            except (IOError, ValueError) as e:
                # IOError: network/file failure; ValueError: malformed URL.
                print(e)
                continue
            count += 1


# Start the download only when this file is executed as a script.
if __name__ == '__main__':
    getimg()

四.百度URL采集

#!/usr/bin/python
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import sys
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


# HTTP request headers imitating a desktop Firefox browser, including a
# fixed X-Forwarded-For address.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Connection": "keep-alive",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "X-Forwarded-For": "120.239.169.74",
}

def url(key, pages=1):
    """Search Baidu for ``key`` and print the final URL of every result.

    For each result page the anchors matched by the CSS selector ``.t > a``
    are collected.  Every link is then requested again and the address the
    response finally ends up at (``Response.url``) is printed, followed by
    the search keyword.  Failed requests are reported and skipped.

    Args:
        key: Search keyword.
        pages: Number of result pages to scan (10 results per page).  The
            default of 1 scans only the first page.
    """
    for offset in range(0, pages * 10, 10):
        try:
            # Passing the query as ``params`` URL-encodes the keyword and
            # avoids the stray '=' that the hand-built "word=%s=&pn=%s"
            # string appended to it.
            r = requests.get("https://www.baidu.com/s",
                             params={"word": key, "pn": offset},
                             headers=headers, verify=False, timeout=2)
        except requests.RequestException as e:
            # A timeout on the search page itself must not crash the script.
            print(e)
            continue

        soup = BeautifulSoup(r.text, "lxml")
        # Result anchors: <a> elements that are direct children of ".t".
        for link in soup.select(".t > a"):
            real_url = link.get('href')
            if not real_url:
                # Anchor without a target; nothing to follow.
                continue
            try:
                # Request the link again to learn where it really leads.
                resp = requests.get(real_url, headers=headers,
                                    verify=False, timeout=2)
            except requests.RequestException as e:
                print(e)
                continue
            print(resp.url)
            print(key)
# url('sss')
if __name__ == '__main__':
    command = sys.argv[1:]
    if not command:
        # Nothing to search for: stop with a usage hint instead of sending
        # an empty query.
        sys.exit('usage: %s <keyword> [more words ...]' % sys.argv[0])
    # Join with spaces so a multi-word query is not fused into one token.
    canshu = " ".join(command)

    url(canshu)
posted @ 2018-05-21 18:28  晓枫v5  阅读(387)  评论(0)  编辑  收藏  举报