清风

导航

2017-11-11 Sa Oct Spider

2017-11-11 Sa Oct Spider

4:33 PM

Again.

First, test urllib:

# -*- coding: utf-8 -*-

import json
import datetime
import HTMLParser  
import urlparse  
import urllib  
import urllib2  
import cookielib  
import string  
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# that implicit str/unicode conversions in this script use UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

def openWithBrowser(filename):
    """Open ``filename`` in the default web browser.

    Calls the ``webbrowser`` module in-process instead of building a
    ``python -m webbrowser "..."`` shell command, so a path containing
    quotes or other shell metacharacters cannot break the command line and
    no ``python`` executable has to be on ``PATH``.
    """
    import webbrowser  # local import keeps this helper self-contained

    webbrowser.open(filename)

# Placeholder credentials: sent as the ``name`` and ``xuehao`` form fields.
name = 'xxx'
no = 'xxx'

# The login form is fetched from and posted back to the same page.
hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'

# Install a cookie-aware opener globally so that cookies received on the
# first request are sent along with the login POST.
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)

# Same timeout as the POST below so an unreachable host cannot hang the
# script; the body is not needed, so the response is closed right away.
h = urllib2.urlopen(hosturl, timeout=5)
h.close()

headers = {
    'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
    'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
}

# Not used further down in this script.
idx = name + ' ' + no

# Hard-coded hidden form fields plus the credentials
# (presumably copied from the login page -- TODO confirm they stay valid).
postData = {
    '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
    '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
    'name' : name,
    'pwd' : '12345',
    'btnchange' : '登录',
    'xuehao' : no
}

postData = urllib.urlencode(postData)
request = urllib2.Request(posturl, postData, headers)
response = urllib2.urlopen(request, timeout=5)

with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
    f.write(response.read())

# Launch the browser only after the ``with`` block has flushed and closed the
# file; opening it earlier may show an empty or truncated page.
openWithBrowser(f.name)

Good. Nothing changed. Then apply it to the table.

5:09 PM

# -*- coding: utf-8 -*-

import json
import datetime
import HTMLParser  
import urlparse  
import urllib  
import urllib2  
import cookielib  
import string  
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# that implicit str/unicode conversions in this script use UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

def openWithBrowser(filename):
    """Open ``filename`` in the default web browser.

    Calls the ``webbrowser`` module in-process instead of building a
    ``python -m webbrowser "..."`` shell command, so a path containing
    quotes or other shell metacharacters cannot break the command line and
    no ``python`` executable has to be on ``PATH``.
    """
    import webbrowser  # local import keeps this helper self-contained

    webbrowser.open(filename)

# One output directory per run, named after the start time.
version = datetime.datetime.now().strftime("%y-%m-%d %a %b %H-%M-%S result")
os.mkdir(version)

# The login form is fetched from and posted back to the same page.
hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'

# Install a cookie-aware opener globally so that cookies received on the
# first request are sent along with every later login POST.
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)

# Same timeout as the POST requests so an unreachable host cannot hang the
# script; the body is not needed, so the response is closed right away.
h = urllib2.urlopen(hosturl, timeout=5)
h.close()

headers = {
    'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
    'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
}

def get(name, no):
    """Log in as one student and save the returned page as ``<version>/<no>.html``.

    Args:
        name: student name, sent as the ``name`` form field.
        no: student number, sent as the ``xuehao`` form field and used as the
            output file name.

    Any error raised by ``urllib2.urlopen`` (including the 5 second timeout)
    or by writing the file propagates to the caller.
    """
    from contextlib import closing

    # Hard-coded hidden form fields plus the credentials
    # (presumably copied from the login page -- TODO confirm they stay valid).
    postData = urllib.urlencode({
        '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
        '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
        'name' : name,
        'pwd' : '12345',
        'btnchange' : '登录',
        'xuehao' : no
    })
    request = urllib2.Request(posturl, postData, headers)

    # Read the whole page first and close the connection deterministically;
    # a failed request then leaves no empty output file behind.
    with closing(urllib2.urlopen(request, timeout=5)) as response:
        page = response.read()

    with open('{}/{}.html'.format(version, no), 'w') as f:
        # Declare the charset so the saved copy renders correctly offline.
        f.write(page.replace('<head>', '<head><meta charset="utf-8">'))

# Fetch and save the page of every student listed in the CSV
# (column 0 is passed as the name, column 1 as the student number).
with open('result_utf8.csv', "rb") as f:
    print(version)  # the parenthesised single-argument form prints the same text
    for line in f:
        fields = line.rstrip('\r\n').split(',')
        if len(fields) < 2:
            continue  # blank or malformed row: nothing to fetch
        name, no = fields[0], fields[1]
        try:
            get(name, no)
        except Exception as e:
            # Keep going, but say which student failed instead of hiding it;
            # Ctrl-C is no longer swallowed. The name is left out of the
            # message to avoid console-encoding errors.
            sys.stderr.write('failed to fetch {}: {!r}\n'.format(no, e))

I spent some time trying to write the output to files with Chinese names, but eventually gave up. It even raised an exception when I printed a (Chinese) name to the console (a decoding issue).

Next, I'll write a reporter.

6:41 PM

# -*- coding: utf-8 -*-

import json
import datetime
import HTMLParser  
import urlparse  
import urllib  
import urllib2  
import cookielib  
import string  
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable
import Tkinter

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# that implicit str/unicode conversions in this script use UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

# Rows of result_utf8.csv as lists of comma-separated fields. getname() reads
# column 0 as the name and column 1 as the student number. The last field of
# each row keeps its line ending.
with open('result_utf8.csv') as f:  # close the file once it has been read
    csv = [line.split(',') for line in f]

def getname(no):
    """Return column 0 of the first ``csv`` row whose column 1 equals ``no``.

    An empty string is returned when no row matches.
    """
    matching_names = (row[0] for row in csv if row[1] == no)
    return next(matching_names, '')

# A complete tag, or an unterminated "<..." that runs to the end of the row.
_TAG_RE = re.compile(r'<[^>]*(?:>|\Z)')
_SPACE_RUN_RE = re.compile(r' +')
_DIGIT_RE = re.compile(r'[0-9]')


def getcourse(filename):
    """Return the text of the first table row that follows the '退选' marker.

    The saved page is searched for the first occurrence of '退选'. The row
    used is the one between the next ``<tr>`` and its ``</tr>``, e.g.::

        <tr><td><a href="...">348</a></td><td>12</td><td>生物培优班</td>
        <td>xxx</td><td>&nbsp;</td><td><a href="...">退选</a></td></tr>

    Tags are dropped, and each text run between tags is kept and followed by
    one space. Collection stops at the first '&' found in text ('&' inside a
    tag is ignored). Everything before the first ASCII digit is discarded,
    runs of spaces are collapsed to a single space, and newlines are removed.

    Args:
        filename: path of the saved HTML page.

    Returns:
        The cleaned row text (it normally ends with one trailing space), or
        '' when the marker, the row, or a digit cannot be found.
    """
    with open(filename) as f:  # close the handle instead of leaking it
        s = f.read()

    marker = s.find('退选')
    if marker == -1:
        return ''

    trbegin = s.find('<tr>', marker)
    if trbegin == -1:
        return ''
    trend = s.find('</tr>', trbegin)
    if trend == -1:
        return ''

    pieces = []
    for run in _TAG_RE.split(s[trbegin:trend]):
        if not run:
            continue  # nothing between two adjacent tags
        text, amp, _ = run.partition('&')
        if amp:
            # First entity (e.g. &nbsp;) ends the interesting part of the row.
            pieces.append(text)
            break
        pieces.append(run + ' ')
    res = ''.join(pieces)

    first_digit = _DIGIT_RE.search(res)
    if first_digit is None:
        return ''

    # Collapse space runs BEFORE deleting newlines: spaces on either side of
    # a newline stay separate, exactly as the character-by-character scan did.
    tail = res[first_digit.start():]
    return _SPACE_RUN_RE.sub(' ', tail).replace('\n', '')

def report():
    """Build an HTML summary of the pages saved in the selected directory.

    Reads the directory name from the ``workdir`` entry, writes one table
    row (student number, name from ``getname``, course text from
    ``getcourse``) per file found there into a temporary ``.html`` file,
    and opens that file in the browser.

    Paths are joined onto the directory instead of changing into it, so the
    process working directory is left untouched even if an error occurs.
    """
    import webbrowser  # local import; avoids building a shell command

    wd = workdir.get()

    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
        f.write('<head><meta charset="utf-8"></head>')
        f.write('<h1>Spider report</h1>')
        f.write('<p><b>Version {}</b></p>'.format(wd))
        f.write('<table>')

        # Sorted so the report order does not depend on the file system.
        for entry in sorted(os.listdir(wd)):
            # File names are "<student number>.html"; splitext also copes
            # with names that contain more than one dot.
            no = os.path.splitext(entry)[0]
            name = getname(no)
            s = getcourse(os.path.join(wd, entry))
            f.write('<tr><th>{}</th><th>{}</th><td>{}</td></tr>'.format(no, name, s))

        f.write('</table>')

    # Open the report only after the file has been flushed and closed;
    # opening it earlier may show an empty or truncated page.
    webbrowser.open(f.name)

# Minimal Tk front end: a "Report" button followed by a text entry whose
# content (bound to ``workdir``) is read by report() when the button is clicked.
gui = Tkinter.Tk()
workdir = Tkinter.StringVar()

report_button = Tkinter.Button(gui, command=report, text="Report")
report_button.pack(side=Tkinter.LEFT)

workdir_entry = Tkinter.Entry(gui, width=40, textvariable=workdir)
workdir_entry.pack(side=Tkinter.LEFT)

gui.mainloop()

posted on 2017-11-11 18:43  清风2009  阅读(180)  评论(0)  编辑  收藏  举报