1. Use a regular expression to check whether an e-mail address is well-formed.

import re

r = r'^\w+(\.\w+)*@\w+(\.\w{2,3}){1,3}$'
e = '23456521@qq.com'
if re.match(r, e):
    print(re.match(r, e).group(0))
else:
    print('error')

2. Use a regular expression to pick out all the phone numbers.
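The same pattern is applied to the page footer inside the exercise-11 code further down; here is a minimal standalone sketch (the sample text and numbers are made up for illustration):

import re

text = 'Tel: 020-89222222  Fax: 0758-2712837'
# \d{3,4} captures the area code, \d{6,8} the local number
print(re.findall(r'(\d{3,4})-(\d{6,8})', text))
# [('020', '89222222'), ('0758', '2712837')]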

9. Extract all the news from one news list page, wrapped in a function def getListPage(pageUrl): (a sketch follows below).
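The post gives no body for this function; a minimal sketch, assuming the same list-page markup used in the rest of this post (.news-list-title for the headline, the first <a> for the link):

import requests
from bs4 import BeautifulSoup

def getListPage(pageUrl):
    # fetch one list page and print every article title and link on it
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            title = news.select('.news-list-title')[0].text
            href = news.select('a')[0]['href']
            print(title, href)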

11. Fetch the details of the news on every news list page.

import re
import requests
from bs4 import BeautifulSoup

def AlltitleAndUrl(url):
    reslist = requests.get(url)
    reslist.encoding = 'utf-8'
    soup_list = BeautifulSoup(reslist.text, 'html.parser')
    head = 'http://news.gzcc.cn/html/tongzhigonggao/'
    tail = '.html'

    for news in soup_list.select('li'):  # first list page
        if len(news.select('.news-list-title')) > 0:
            # title
            title = news.select('.news-list-title')[0].text
            # publishing unit
            company = news.select('div .news-list-info > span')[1].text
            # link
            href = news.select('a')[0]['href']
            # publication time
            time = news.select('div .news-list-info > span')[0].text

            print('\n')
            print("Title: " + title)
            print("Link: " + href)
            print("Publisher: " + company)
            print("Published: " + time)

    # match all the phone numbers in the page footer
    for a in soup_list.select('#footer'):
        print(re.findall(r'(\d{3,4})-(\d{6,8})', a.select('div.container')[0].text))

    # the remaining list pages
    for i in range(2, 95):
        nexturl = '%s%s%s' % (head, i, tail)
        newcontent = requests.get(nexturl)
        newcontent.encoding = 'utf-8'
        soup_alllist = BeautifulSoup(newcontent.text, 'html.parser')
        for news in soup_alllist.select('li'):
            if len(news.select('.news-list-title')) > 0:
                title = news.select('.news-list-title')[0].text
                company = news.select('div .news-list-info > span')[1].text
                href = news.select('a')[0]['href']
                time = news.select('div .news-list-info > span')[0].text

                print('\n')
                print("Title: " + title)
                print("Link: " + href)
                print("Publisher: " + company)
                print("Published: " + time)

3. Use a regular expression to split English text into words: re.split(r'[\s,.?\-]+', news)

import re

news = '''Process finished with exit code'''
word = re.split(r'[\s,.?\-]+', news)
print(word)  # ['Process', 'finished', 'with', 'exit', 'code']

4. Use a regular expression to extract the news ID.

5. Build the Request URL for the click count.

import re

newsUrl = 'http://news.gzcc.cn/html/2017/xiaoyuanxinwen_095/8249.html'
num = re.search(r'_(.*)\.html', newsUrl).group(1)
print(num)  # 095/8249
# the news ID is the segment after the last slash
newsId = re.search(r'_(.*)\.html', newsUrl).group(1).split('/')[-1]
res = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
print(res)

6. Get the click count.

import requests

HitUrl = 'http://oa.gzcc.cn/api.php?op=count&id=9183&modelid=80'
# the API returns a jQuery snippet; the count sits in the last .html('N') call
hitNumber = requests.get(HitUrl).text.split('.html')[-1].lstrip("('").rstrip("');")
print("Clicks:", hitNumber)

7. Wrap steps 4, 5, and 6 into a single function def getClickCount(newsUrl):

import re
import requests

def getClickCount(newsUrl):
    # steps 4-5: pull the news ID out of the article URL and build the API URL
    newsId = re.search(r'_(.*)\.html', newsUrl).group(1).split('/')[-1]
    hitUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    # step 6: strip the jQuery .html('N') wrapper around the count
    hitNumber = requests.get(hitUrl).text.split('.html')[-1].lstrip("('").rstrip("');")
    print("Clicks:", hitNumber)
    print("News ID:", newsId)

8. Wrap the code that fetches the news details into a function def getNewDetail(newsUrl):

import requests
from bs4 import BeautifulSoup

def getNewDetail(url):  # scrape the details of every news item on one list page
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            # headline on the list page
            title = news.select('.news-list-title')[0].text
            # summary on the list page
            description = news.select('.news-list-description')[0].text
            # info line on the list page
            info = news.select('.news-list-info')[0].text
            # link to the article page
            href = news.select('a')[0]['href']

            # follow the link to the article itself
            detail = requests.get(href)
            detail.encoding = 'utf-8'
            soup_detail = BeautifulSoup(detail.text, 'html.parser')

            # metadata line of the article
            newinfo = soup_detail.select('.show-info')[0].text

            # article body
            content = soup_detail.select('#content')[0].text
            # the metadata line splits into date, time, author, checker, source, photographer
            date = newinfo.split()[0]
            time = newinfo.split()[1]
            author = newinfo.split()[2]
            checker = newinfo.split()[3]
            source = newinfo.split()[4]
            Photography = newinfo.split()[5]
            print("Title: " + title)
            print("\nDescription: " + description)
            print("\nInfo:\n" + date + ' ' + time + '\n' + author + '\n' + checker + '\n' + source + '\n' + Photography)
            getClickCount(href)  # clicks and news ID (exercise 7)
            print("\nLink: " + href)
            print(content)

10. Get the total number of news articles and compute the total number of list pages, wrapped in a function def getPageN():

import requests
from bs4 import BeautifulSoup

def getPageN():
    res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # '.a1' holds the total article count (e.g. '981条'); strip the unit character
    pagenumber = int(soup.select('.a1')[0].text.rstrip('条'))
    # ten articles per list page
    page = pagenumber // 10 + 1
    return page
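
A hypothetical driver tying these functions together (getListPage is the sketch from exercise 9; the '2.html', '3.html', … page-URL pattern is an assumption mirroring the notice-page URLs built in the exercise-11 code):

n = getPageN()
getListPage('http://news.gzcc.cn/html/xiaoyuanxinwen/')  # first list page
for i in range(2, n + 1):
    # assumed URL pattern for subsequent list pages
    getListPage('http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i))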

 

posted on 2018-04-10 21:10 by 224杨晓潮