# -*- coding: UTF-8 -*-
import re
from bs4 import BeautifulSoup
import requests
import codecs
import sys
# Python 2 only: site.py removes sys.setdefaultencoding at interpreter
# start-up, so the sys module is reloaded here to get the function back.
reload(sys)
# Make UTF-8 (rather than ASCII) the process-wide default codec used for
# implicit str <-> unicode conversions.
sys.setdefaultencoding('utf8')
def mei_url():
    """Download ``http://mdl.com/product`` and return it parsed.

    The response body is decoded as UTF-8 regardless of what the server
    declares, then parsed with the ``lxml`` parser.

    Returns:
        The ``BeautifulSoup`` document built from the response text.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not answer within the timeout.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    listing = requests.get('http://mdl.com/product', timeout=30)
    listing.encoding = 'utf-8'
    return BeautifulSoup(listing.text, 'lxml')
def mei_info(sub_url='/product/item/293410'):
    """Scrape one product page and append its fields to the output file.

    The page at ``http://mdl.com`` + ``sub_url`` is downloaded and decoded
    as UTF-8. Four text fields are extracted with CSS selectors, the title
    is printed, and one record is appended to the output file in the form
    ``title&introduce&effect&crowd`` followed by a newline and a ``$``.

    Args:
        sub_url: Site-relative path of the product page.

    Raises:
        IndexError: If the page lacks any of the expected elements.
        requests.exceptions.RequestException: If the request fails or the
            server does not answer within the timeout.
    """
    container = '#main > div.boundary > div > div.container__main'
    title_selector = container + ' > div.section.section-info.clearfix > h2'
    body_selector = (container
                     + ' > div.section.section-intro.clearfix > div'
                     + ' > div.section-intro__item__body.rich-text')
    span_selector = body_selector + ' > span'

    # Without a timeout requests waits indefinitely on a stalled server.
    page = requests.get('http://mdl.com' + sub_url, timeout=30)
    page.encoding = 'utf-8'
    soup = BeautifulSoup(page.text, 'lxml')

    title = soup.select(title_selector)[0].get_text()
    # "introduce" and "crowd" are the first and third matches of the same
    # selector, so the document is queried once for both.
    bodies = soup.select(body_selector)
    introduce = bodies[0].get_text()
    effect = soup.select(span_selector)[0].get_text()
    crowd = bodies[2].get_text()

    print(title)

    # get_text() already returns unicode text; joining it directly avoids a
    # str() round-trip that fails on non-ASCII text unless the interpreter's
    # default encoding has been changed.
    record = u'&'.join([title, introduce, effect, crowd])
    with codecs.open(r'E:\note\mei_infov3.txt', 'a+', 'utf8') as out:
        out.write(record + u'\n' + u'$')
if __name__ == '__main__':
    # The product links are taken from two HTML files on local disk. The
    # network alternative is kept here for reference:
    # items=mei_url()
    # items=str(items)
    item_link = re.compile(r'/product/item/\d{6}')

    url_list3 = []
    for page_path in (r'E:\note\mei.htm', r'E:\note\mei2.htm'):
        # The context manager closes the handle as soon as parsing is done.
        with open(page_path) as page:
            soup = BeautifulSoup(page, 'lxml')
        url_list3.extend(item_link.findall(str(soup)))

    # NOTE(review): findall returns every match, so a link that occurs more
    # than once in the files is scraped more than once - confirm whether
    # that is intended.
    print(len(url_list3))
    for sub_url in url_list3:
        mei_info(sub_url)