Python实现随机读取文本N行数据

工作中需要判断某个文本中的URL是否能正常访问，并且随机获取其中N行能正常访问的URL数据。我的思路是：读取文本的每一行数据，用urlopen访问，将返回状态码为200的URL保存到一个列表；然后获得列表长度，使用random产生一个随机值作为列表下标，获取该行数据，重复直到取满N行。具体实现如下（Python 2）：

 1 import urllib2,random
 2 from sets import Set
 3 
 4 def get_responses(url):
 5     global good_list
 6     global bad_list
 7     if not url.startswith("http:"):
 8         http_url = "http://" + url
 9     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1',}
10     try:
11         request = urllib2.Request(http_url, headers=headers)
12         resp = urllib2.urlopen(request)
13         print url
14     except urllib2.URLError, e:
15         print e
16         bad_list.append(url)
17         return 0
18 
19     retcode = resp.getcode()
20     if retcode == 200:
21         good_list.append(url)
22         #return 1
23     else:
24         bad_list.append(url)
25         #return 0
26 
def readFile(path=r'C:\Users\888\Desktop\urls.txt'):
    """Probe every URL listed in a text file and print a summary.

    Each non-blank line is stripped of surrounding whitespace and passed to
    ``get_responses``, which sorts it into the module-level ``good_list`` or
    ``bad_list``. The totals are printed once the whole file is processed.

    Args:
        path: Text file holding one URL per line. Defaults to the location
            the script was originally written for.

    Returns:
        None. If the file cannot be opened a message is printed and the
        function returns without probing anything.
    """
    try:
        urllist = open(path, 'r')
    except IOError:
        print("file does not exist.\n")
        # Bail out: the original fell through and crashed on the unbound
        # file handle.
        return

    # The context manager closes the file even if a probe raises.
    with urllist:
        for line in urllist:
            # strip() also removes '\r' and stray spaces left by Windows
            # line endings; blank lines are not URLs, so skip them.
            item = line.strip()
            if not item:
                continue
            get_responses(item)

    print("Total URLs: %d, Good URLs:%d, Bad URLs: %d." % ((len(good_list) + len(bad_list)), len(good_list), len(bad_list)))
38     
def writeFile(linenum, path=r'C:\Users\888\Desktop\goodurl.txt', urls=None):
    """Write up to *linenum* distinct, randomly chosen good URLs to a file.

    Args:
        linenum: Number of URLs wanted (anything ``int()`` accepts). When
            fewer distinct URLs are available, all of them are written.
        path: Output file, overwritten on each call. Defaults to the
            location the script was originally written for.
        urls: URLs to choose from. Defaults to the module-level
            ``good_list``.

    Returns:
        None. If the output file cannot be opened a message is printed and
        nothing is written.
    """
    if urls is None:
        urls = good_list

    # Deduplicate first, then sample without replacement. The original
    # retry loop never terminated when fewer than linenum distinct URLs
    # existed, and raised ValueError on an empty list.
    candidates = sorted(set(urls))
    count = max(0, min(int(linenum), len(candidates)))
    result = random.sample(candidates, count)

    # Put the good_url in goodurl.txt file
    try:
        goodurl = open(path, 'w+')
    except IOError:
        print("file does not exist.\n")
        # Bail out: the original fell through and crashed on the unbound
        # file handle.
        return

    # The context manager closes the file even if a write fails.
    with goodurl:
        goodurl.writelines(item + '\n' for item in result)

    print("The mission is done, Please check the goodurl.txt file")
58     
# Shared result lists filled in by get_responses(). They live at module
# level (not inside the __main__ guard) so the functions also work when
# this file is imported rather than run as a script.
good_list = []
bad_list = []

if __name__ == "__main__":
    # Probe every URL in the input file, then save 150 random good ones.
    readFile()
    writeFile(150)

 

posted @ 2014-10-13 14:34  bamb00  阅读(7459)  评论(0编辑  收藏  举报