• 博客园logo
  • 会员
  • 众包
  • 新闻
  • 博问
  • 闪存
  • 赞助商
  • HarmonyOS
  • Chat2DB
    • 搜索
      所有博客
    • 搜索
      当前博客
  • 写随笔 我的博客 短消息 简洁模式
    用户头像
    我的博客 我的园子 账号设置 会员中心 简洁模式 ... 退出登录
    注册 登录
james1207

博客园    首页    新随笔    联系   管理    订阅  订阅

Python 使用 POST 和 GET 登录 Web 后台系统并抓取页面

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import httplib
import re
import socket
import urllib

# Process-wide default socket timeout, in seconds. It applies to sockets that
# are created without an explicit timeout of their own.
timeout = 60
socket.setdefaulttimeout(timeout)


def getTable(path='kvpage.html'):
    """Return the inner HTML of a ``<tbody>`` element from a saved page.

    The file is scanned line by line (each line is stripped first), so the
    opening ``<tbody>`` and closing ``</tbody>`` tags must sit on the same
    line to be recognised. Because the pattern starts with a greedy ``.*``,
    a line holding several ``<tbody>`` elements yields the content of the
    last one.

    Args:
        path: File to scan. Defaults to ``'kvpage.html'``, the file written
            by ``catchPage``.

    Returns:
        The text between ``<tbody>`` and ``</tbody>`` on the first matching
        line, or ``None`` when no line matches.

    Raises:
        IOError: If ``path`` cannot be opened.
    """
    pattern = re.compile(r'.*<tbody>(.*?)</tbody>.*')

    # ``with`` releases the handle even when reading fails part-way through.
    with open(path) as f:
        for line in f:
            m = pattern.match(line.strip())
            if m is not None:
                return m.group(1)

    return None

def extractKvEvents(content):
    """Split table-body HTML into a list of rows of cell strings.

    Rows are the substrings enclosed by ``<tr>``/``</tr>``; within each row
    the cells are the substrings enclosed by
    ``<td class="confluenceTd">``/``</td>``. Neither pattern matches across
    newlines, and tags carrying other attributes are not recognised.

    Args:
        content: HTML text of the table body, or ``None``.

    Returns:
        A list with one entry per ``<tr>`` row, each entry being the list of
        cell contents found in that row. A row without any matching cell
        contributes an empty list. ``None`` or empty input yields ``[]``.
    """
    # ``getTable`` returns None when no table was found; treat that as
    # "no rows" instead of failing inside ``findall``.
    if not content:
        return []

    patternTR = re.compile(r"<tr>(.*?)</tr>")
    patternTD = re.compile(r'<td class="confluenceTd">(.*?)</td>')

    # ``findall`` always returns a list (never None), so every row is kept,
    # including rows in which no cell matched.
    return [patternTD.findall(row) for row in patternTR.findall(content)]

def outputToExcel(table):
    """Print every row of ``table`` on its own line.

    Despite the name, nothing is written to an Excel file; each row is sent
    to standard output.
    """
    for cells in table:
        print(cells)

def loginWiki():
    """Log in via POST and save the session cookie to ``cookie.txt``.

    Sends the login form to ``/login.action`` and stores the value of the
    response's ``Set-Cookie`` header so that ``catchPage`` can replay it.
    Every failure is caught and printed; the function always returns
    ``None``.
    """
    httpClient = None
    try:
        # NOTE(review): credentials are hard-coded placeholders and are sent
        # over plain HTTP (HTTPConnection, not HTTPSConnection).
        params = urllib.urlencode({'os_username': 'xxxx@xxx.com',
                                   'os_password': 'xxxx',
                                   'login': 'Log In'})

        headers = {"Content-type": "application/x-www-form-urlencoded",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}

        httpClient = httplib.HTTPConnection("xxx.com", 8080, timeout=30)
        httpClient.request("POST", "/login.action", params, headers)

        response = httpClient.getresponse()
        cookie = response.getheader('Set-Cookie')
        if cookie is None:
            # Check before opening the file: opening with 'w' truncates it,
            # which would wipe a previously saved cookie for nothing.
            print('login response has no Set-Cookie header; cookie.txt left unchanged')
            return
        print(cookie)

        # ``with`` closes the file even if the write fails.
        with open('cookie.txt', 'w') as cookieFile:
            cookieFile.write(cookie)
    except Exception as e:
        print(e)
    finally:
        if httpClient:
            httpClient.close()

def catchPage():
    """Fetch the target page with the saved cookie and store it on disk.

    Reads the cookie from ``cookie.txt``, issues a GET for ``/xxxPath`` with
    that cookie attached, prints the response status and reason, and writes
    the response body to ``kvpage.html``. The body is written whatever the
    status code is. Every failure is caught and printed; the function
    always returns ``None``.
    """
    httpClient = None

    try:
        # Read the cookie saved by the login step.
        with open('cookie.txt') as f:
            cookie = f.read().strip()
        print(cookie)

        # Build the request headers.
        headers = {"Content-type": "application/x-www-form-urlencoded",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                   'Cookie': cookie}

        # Send the request.
        httpClient = httplib.HTTPConnection('xxx.com', 8080, timeout=30)
        httpClient.request('GET', '/xxxPath', headers=headers)

        # ``response`` is an HTTPResponse object.
        response = httpClient.getresponse()
        print(response.status)
        print(response.reason)

        # Read the body before opening the output file: opening with 'w'
        # truncates it, so a failed read must not destroy the previous page.
        body = response.read()
        with open('kvpage.html', 'w') as htmlPage:
            htmlPage.write(body)
    except Exception as e:
        print(e)
    finally:
        if httpClient:
            httpClient.close()

if __name__ == '__main__':
    # Pipeline: log in, download the page, locate the table body, split it
    # into rows and cells, then print the rows.
    loginWiki()
    catchPage()
    tablecontent = getTable()
    if tablecontent is None:
        # getTable() returns None when no line holds a complete
        # <tbody>...</tbody> pair; there is nothing to parse in that case.
        print('no <tbody> found in kvpage.html')
    else:
        table = extractKvEvents(tablecontent)
        outputToExcel(table)
 
    


posted @ 2013-10-29 21:20  Class Xman  阅读(461)  评论(0)    收藏  举报
刷新页面返回顶部
博客园  ©  2004-2025
浙公网安备 33010602011771号 浙ICP备2021040463号-3