import urllib2
from BeautifulSoup import BeautifulSoup
import random
import time
def checkIndex(url):
    """Check whether Baidu's result page mentions ``url``.

    The function searches Baidu for ``url`` (with any ``http://`` prefix and
    surrounding whitespace removed), takes the first ``<span class="g">``
    element that ``soup.find`` returns, and looks for the cleaned URL inside
    that element's children.

    Args:
        url: The address to look up. Lines read straight from a file (with a
            trailing newline) are accepted.

    Returns:
        The cleaned URL (no ``http://``, no surrounding whitespace) when it
        appears in the inspected span, otherwise ``None``. Blank input
        returns ``None`` without issuing a request.

    Raises:
        urllib2.URLError: propagated from ``urllib2.urlopen`` when the
            request fails.
    """
    # Local imports: these names are not imported at file level.
    import urllib
    from contextlib import closing

    # Strip first: a trailing newline would otherwise end up both in the
    # query string and in the substring test below, which then never matches.
    url = url.strip().replace('http://', '')
    if not url:
        return None

    # Percent-encode the query so characters such as '&', '?', '#' or spaces
    # in the target URL cannot break the search request. quote_plus works on
    # byte strings, so unicode input is encoded first.
    if isinstance(url, unicode):
        query = url.encode('utf-8')
    else:
        query = url
    baiduUrl = 'http://www.baidu.com/s?wd=' + urllib.quote_plus(query)

    # closing() guarantees the HTTP response is released even if read() fails.
    with closing(urllib2.urlopen(baiduUrl)) as webPage:
        webCont = webPage.read()

    # Drop the <b> highlight tags so a highlighted URL is contiguous text.
    webCont = webCont.replace('<b>', '').replace('</b>', '')
    soup = BeautifulSoup(webCont)
    findlist = soup.find('span', {'class': 'g'})
    if not findlist:
        return None

    # Inspect every child of the span; only give up once none of them
    # contains the URL.
    for each in findlist:
        if url in unicode(each):
            return url
    return None
# Batch driver: read one URL per line from list.txt, look each one up with
# checkIndex, and write the result (the matched URL, or the text "None") to
# check.txt, one line per input URL.
#
# The with-block closes both files even when a lookup raises part-way through.
with open('list.txt') as urllist, open('check.txt', 'w') as res:
    # Iterate the file lazily instead of loading every line with readlines().
    for eachurl in urllist:
        # Remove the line terminator before the lookup and skip blank lines,
        # which would otherwise trigger a pointless query.
        target = eachurl.strip()
        if not target:
            continue

        indexurl = unicode(checkIndex(target)) + '\n'
        res.write(indexurl)

        # Draw a fresh delay (1-20 seconds) for every request; a single value
        # chosen before the loop would make all pauses identical.
        waittime = random.randint(1, 20)
        time.sleep(waittime)

# Parenthesised form prints the same text under Python 2's print statement.
print('over!')