继续潜水

导航

 

目前完成了我爱我家和链家的房源信息获取,代码写得比较粗糙~

houseWoaiwojia.py:

#encoding=utf-8
import result as r
import db
import datetime
import houseLianjiaVo as vo
import json
import time



def getdata(url):
    """Scrape one 5i5j listing page and return the listings not seen before.

    The page is fetched through ``r.getUrl`` and every ``<li>`` of the first
    ``ul.pList`` element is turned into a ``vo.houseLianjiaVo``.  Listings
    whose code is already in the module-level ``arrayList`` are skipped; the
    code of every listing returned is appended to ``arrayList``.

    A listing that cannot be parsed is reported on stdout and skipped, so one
    malformed entry no longer aborts the whole page.

    :param url: absolute URL of the listing page.
    :return: list of ``vo.houseLianjiaVo`` objects; empty when the page has no
        ``ul.pList`` element or contains nothing new.
    """
    soup = r.getUrl(url)
    containers = soup.find_all("ul", class_="pList")
    if not containers:
        # No listing container on this page: nothing to parse.
        print("url:%s     li数量:%i" % (url, 0))
        return []
    all_li = containers[0].find_all("li")
    print("url:%s     li数量:%i" % (url, len(all_li)))

    houses = []
    for item in all_li:
        headings = item.find_all("h3", class_="listTit")
        if not headings:
            # Not a listing entry (no title heading).
            continue
        title = ""
        try:
            title_a = headings[0].find_all("a")[0]
            title = title_a.getText()
            # The listing code is carried in the JSON of the "tdjson" attribute.
            code = json.loads(title_a["tdjson"])["content"]
            if code in arrayList:
                continue
            href = domain + title_a["href"]

            # Image URL: "src" when present, otherwise "data-src", otherwise empty.
            img = ""
            lazy_img = item.find("img", class_="lazy")
            if lazy_img is not None:
                if "src" in lazy_img.attrs:
                    img = lazy_img["src"]
                elif "data-src" in lazy_img.attrs:
                    img = lazy_img["data-src"]

            listX_p = item.find_all("div", class_="listX")[0].find_all("p")
            house_info = listX_p[0].getText()
            region2 = listX_p[1].getText().split(" ")[0]
            region1 = listX_p[1].find_all("a")[0].getText()
            release_time = listX_p[2].getText().split("·")[2]
            price_box = item.find_all("div", class_="jia")[0]
            total_price = price_box.find_all("strong")[0].getText()
            data_price = price_box.find_all("p")[1].getText()

            # house_info is "structure·area·orientation·floor·decoration·material";
            # pad with empty strings so a shorter description does not drop the listing.
            parts = [part.strip() for part in house_info.split("·")]
            parts += [""] * (6 - len(parts))
            (house_structure, total_square, orientation,
             floor, decoration_degree, material) = parts[:6]

            create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            house = vo.houseLianjiaVo(code, img, title, href, region1, region2,
                                      house_structure, total_square, orientation,
                                      decoration_degree, floor, material,
                                      create_time, release_time, total_price, data_price)
            houses.append(house)
            arrayList.append(code)
        except Exception as err:
            print("失败,url:%s,title:%s,%s" % (url, title, err))

    return houses



def _sql_quote(value):
    """Render ``value`` as a single-quoted SQL string literal.

    Backslashes and single quotes are doubled so scraped text cannot end the
    literal early.  NOTE(review): this assumes the MySQL default mode, in
    which backslash is an escape character inside string literals.
    """
    text = str(value).replace("\\", "\\\\").replace("'", "''")
    return "'" + text + "'"


def encapsulation_db(list):
    """Build one multi-row ``insert into house`` statement.

    Every element of ``list`` contributes one row: ``null`` for the
    auto-increment id, the module-level ``house_type``, then the sixteen
    attributes of the house object in table-column order.

    :param list: non-empty sequence of house objects (``code``, ``img``,
        ``title``, ``href``, ``region1``, ``region2``, ``house_structure``,
        ``total_square``, ``orientation``, ``decoration_degree``, ``floor``,
        ``material``, ``create_time``, ``release_time``, ``total_price``,
        ``data_price``).
    :return: the SQL text, terminated by ``;``.
    :raises ValueError: if ``list`` is empty (no valid statement can be built).
    """
    if not list:
        raise ValueError("encapsulation_db needs at least one house")
    rows = []
    for d in list:
        fields = (
            house_type,
            d.code,
            d.img,
            d.title,
            d.href,
            d.region1,
            d.region2,
            d.house_structure,
            d.total_square,
            d.orientation,
            d.decoration_degree,
            d.floor,
            d.material,
            d.create_time,
            d.release_time,
            d.total_price,
            d.data_price,
        )
        rows.append("(null," + ",".join(_sql_quote(field) for field in fields) + ")")
    return """ insert into house  values""" + ",".join(rows) + ";"





def date_util(date, utc_offset_hours=8):
    """Format a millisecond Unix timestamp as ``YYYY-mm-dd HH:MM:SS``.

    The previous version converted with ``utcfromtimestamp`` and was therefore
    eight hours behind, as its own comment noted.  The conversion now applies
    an explicit UTC offset, UTC+8 by default.

    :param date: Unix timestamp in milliseconds.
    :param utc_offset_hours: offset from UTC, in hours, of the zone to render in.
    :return: the formatted date-time string.
    """
    zone = datetime.timezone(datetime.timedelta(hours=utc_offset_hours))
    moment = datetime.datetime.fromtimestamp(date / 1000, zone)
    return moment.strftime("%Y-%m-%d %H:%M:%S")

# Codes of listings already stored or scraped in this run; getdata() reads and extends it.
arrayList = []
# Site root, prepended to the relative listing links found on a page.
domain = "https://hz.5i5j.com"
# Value written to the `type` column for every row produced by this script.
house_type = "我爱我家"
if __name__ == '__main__':
    # NOTE(review): the connection credentials are hard-coded; consider moving them to configuration.
    handle = db.DataBaseHandle('127.0.0.1', 'root', '1qaz@WSX', 'house', 3306)
    try:
        known = handle.selectDb("select code from house where type ='%s'  " % house_type)
        # A failed query may yield nothing usable; treat that as "no known codes".
        for row in known or ():
            arrayList.append(row[0])

        # Walk listing pages n1 .. n100 and store whatever is new on each.
        for page_no in range(1, 101):
            page = "n%s" % page_no
            url = domain + "/ershoufang/" + page
            houses = getdata(url)
            if not houses:
                continue
            sql = encapsulation_db(houses)
            try:
                # insertDB reports database errors itself; this handler covers anything it lets through.
                handle.insertDB(sql)
                print("插入数据库成功%s" % page)
            except Exception:
                print("插入数据库失败%s" % page)
    finally:
        handle.closeDb()

houseLianjia.py:

#encoding=utf-8
import result as r
import db
import datetime
import houseLianjiaVo as vo
import time



def getdata(url):
    """Scrape one Lianjia listing page and return the listings not seen before.

    The page is fetched through ``r.getUrl`` and every ``<li>`` of the first
    ``ul.sellListContent`` element is turned into a ``vo.houseLianjiaVo``.
    Listings whose code is already in the module-level ``arrayList`` are
    skipped; the code of every listing returned is appended to ``arrayList``.

    A listing that cannot be parsed is reported on stdout and skipped, so one
    malformed entry no longer aborts the whole page.

    :param url: absolute URL of the listing page.
    :return: list of ``vo.houseLianjiaVo`` objects; empty when the page has no
        ``ul.sellListContent`` element or contains nothing new.
    """
    soup = r.getUrl(url)
    containers = soup.find_all("ul", class_="sellListContent")
    if not containers:
        # No listing container on this page: nothing to parse.
        print("url:%s     li数量:%i" % (url, 0))
        return []
    all_li = containers[0].find_all("li")
    print("url:%s     li数量:%i" % (url, len(all_li)))

    houses = []
    for item in all_li:
        title_divs = item.find_all("div", class_="title")
        if not title_divs:
            # Not a listing entry (no title block).
            continue
        title = ""
        try:
            title_a = title_divs[0].find_all("a")[0]
            title = title_a.getText()
            code = title_a["data-housecode"]
            if code in arrayList:
                continue
            href = title_a["href"]

            # A missing image must not discard the listing; store an empty URL instead.
            img_tag = item.find("img", class_="lj-lazy")
            img = img_tag.get("src", "") if img_tag is not None else ""

            positionInfo_a = item.find_all("div", class_="positionInfo")[0].find_all("a")
            region1 = positionInfo_a[0].getText()
            region2 = positionInfo_a[1].getText()

            # houseInfo is "structure|area|orientation|decoration|floor|material";
            # strip each field (as the 5i5j scraper does) and pad with empty
            # strings so a shorter description does not drop the listing.
            house_info = item.find_all("div", class_="houseInfo")[0].getText()
            parts = [part.strip() for part in house_info.split("|")]
            parts += [""] * (6 - len(parts))
            (house_structure, total_square, orientation,
             decoration_degree, floor, material) = parts[:6]

            release_time = item.find_all("div", class_="followInfo")[0].getText().split("/")[1]
            total_price = item.find_all("div", class_="totalPrice")[0].find_all("span")[0].getText()
            data_price = item.find_all("div", class_="unitPrice")[0]["data-price"]
            create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            house = vo.houseLianjiaVo(code, img, title, href, region1, region2,
                                      house_structure, total_square, orientation,
                                      decoration_degree, floor, material,
                                      create_time, release_time, total_price, data_price)
            houses.append(house)
            arrayList.append(code)
        except Exception as err:
            print("失败,url:%s,title:%s,%s" % (url, title, err))

    return houses



def _sql_quote(value):
    """Render ``value`` as a single-quoted SQL string literal.

    Backslashes and single quotes are doubled so scraped text cannot end the
    literal early.  NOTE(review): this assumes the MySQL default mode, in
    which backslash is an escape character inside string literals.
    """
    text = str(value).replace("\\", "\\\\").replace("'", "''")
    return "'" + text + "'"


def encapsulation_db(list):
    """Build one multi-row ``insert into house`` statement.

    Every element of ``list`` contributes one row: ``null`` for the
    auto-increment id, the module-level ``house_type``, then the sixteen
    attributes of the house object in table-column order.

    :param list: non-empty sequence of house objects (``code``, ``img``,
        ``title``, ``href``, ``region1``, ``region2``, ``house_structure``,
        ``total_square``, ``orientation``, ``decoration_degree``, ``floor``,
        ``material``, ``create_time``, ``release_time``, ``total_price``,
        ``data_price``).
    :return: the SQL text, terminated by ``;``.
    :raises ValueError: if ``list`` is empty (no valid statement can be built).
    """
    if not list:
        raise ValueError("encapsulation_db needs at least one house")
    rows = []
    for d in list:
        fields = (
            house_type,
            d.code,
            d.img,
            d.title,
            d.href,
            d.region1,
            d.region2,
            d.house_structure,
            d.total_square,
            d.orientation,
            d.decoration_degree,
            d.floor,
            d.material,
            d.create_time,
            d.release_time,
            d.total_price,
            d.data_price,
        )
        rows.append("(null," + ",".join(_sql_quote(field) for field in fields) + ")")
    return """ insert into house  values""" + ",".join(rows) + ";"




def date_util(date, utc_offset_hours=8):
    """Format a millisecond Unix timestamp as ``YYYY-mm-dd HH:MM:SS``.

    The previous version converted with ``utcfromtimestamp`` and was therefore
    eight hours behind, as its own comment noted.  The conversion now applies
    an explicit UTC offset, UTC+8 by default.

    :param date: Unix timestamp in milliseconds.
    :param utc_offset_hours: offset from UTC, in hours, of the zone to render in.
    :return: the formatted date-time string.
    """
    zone = datetime.timezone(datetime.timedelta(hours=utc_offset_hours))
    moment = datetime.datetime.fromtimestamp(date / 1000, zone)
    return moment.strftime("%Y-%m-%d %H:%M:%S")

# Codes of listings already stored or scraped in this run; getdata() reads and extends it.
arrayList = []
# Value written to the `type` column for every row produced by this script.
house_type = "链家"
if __name__ == '__main__':
    # NOTE(review): the connection credentials are hard-coded; consider moving them to configuration.
    handle = db.DataBaseHandle('127.0.0.1', 'root', '1qaz@WSX', 'house', 3306)
    try:
        known = handle.selectDb("select code from house where type ='%s'  " % house_type)
        # A failed query may yield nothing usable; treat that as "no known codes".
        for row in known or ():
            arrayList.append(row[0])

        # Walk listing pages pg1 .. pg100 and store whatever is new on each.
        for page_no in range(1, 101):
            page = "pg%s" % page_no
            url = "https://hz.lianjia.com/ershoufang/" + page + "co32/"
            houses = getdata(url)
            if not houses:
                continue
            sql = encapsulation_db(houses)
            try:
                # insertDB reports database errors itself; this handler covers anything it lets through.
                handle.insertDB(sql)
                print("插入数据库成功%s" % page)
            except Exception:
                print("插入数据库失败%s" % page)
    finally:
        handle.closeDb()

result.py:

import requests
from bs4 import BeautifulSoup
import ip_list

# 消息头
# headers ={
# "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
# "Accept-Encoding":"gzip, deflate, br",
# "Accept-Language":"zh-CN,zh;q=0.9",
# "Cache-Control":"max-age=0",
# "Connection":"keep-alive",
# "Cookie":"TS013af1c1=01ef8f99f1d0871b269c53340fa6269029504a80c4e2e4fb5f8c141dc3ea1889dfe9b73e3546aaef50810eb7137614f90423824a3a; _gscu_415563124=710350909btrml20; _gscbrs_415563124=1; TS01dde381_77=087968f3e8ab280075ffa9551835e2b3bc9a0a6ef1f753105fc148025ab082fd79a1cb86cc3a8f52e22695c031dd504308dd5f77e7823800bc61bb50494d8e87319bf3b42d4db90a536ed3feb83a77f2f122231d05b1d9d2348a3ef88a547a3a4fa01ac0c30c12acae8ea546e6c1a1ba; TSf97de9a7_76=087968f3e8ab2800694c407f83260b5fb36f147a01070c43e22a21076b5ec514c7b53bce88b8138a25acd182ef87b3ba08ef9bf94f07e800607868f728157db181454561ee310058f9e829ec0810c6cdcb21744ee6aac2d22d2d391d9dec7ed93dd2cc97f0534a13176b017915a82198365ab759a9c450c111a80907ed69974e36be3d3b9a2329829301cd8625d168c2f1b3b00c879662fc185e5c040d86ecabf8d9fd0d7582082883f0e4517e9ed01aef8fa6c301b7e34fba91950ff8a73444c94299ebebf81d60a295b2b378cb7f282d8c42bde8c1c6278b6e33bec5e77c19753bc6bf5a685fefd3e5bc832bf7b228faa342f439fdc647c4c009f2c59d7051f66d584aecb72f84a3a0ae4ad34e90593f62365471bf182f873e90c607771894; TSPD_101=087968f3e8ab2800694c407f83260b5fb36f147a01070c43e22a21076b5ec514c7b53bce88b8138a25acd182ef87b3ba:087968f3e8ab2800694c407f83260b5fb36f147a01070c43e22a21076b5ec514c7b53bce88b8138a25acd182ef87b3ba08ef9bf94f06300055e7cf7b71c21fcf57bb0d7a08e541b632d1e81bc2b89a1a0b150eb4c70f05a351fc3a1b4aa2c87583b1593295915bf8; a6c1b8e3d8ee43f7b55efdb3b44bd46e=WyIzNDA2NzQ2MzE0Il0; TS01dde381=01ef8f99f13c1ae3080e65ae71621810e05b79c0c6aa9bdd4bdd59420bc12c88c9cf07ae6c9dbbd39b34b715438e2a022c50ce917f; TSf97de9a7_27=087968f3e8ab2000afda1f036834422803e5ea6f78d56a8329a6c70c885298f80c660dbeab022de90855e1bd82092000ce04d01001dd1ea60427ae89da80b6ff03d8eafda5efd5e059e62400b4741136",
# "Host":"www.12309.gov.cn",
# "If-None-Match":'W/"10d1c-m2GHDG7mOl/LWWDK4ZcVfpV31es"',
# "Sec-Fetch-Mode":"navigate",
# "Sec-Fetch-Site":"none",
# "Sec-Fetch-User":"?1",
# "Upgrade-Insecure-Requests":"1",
# "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36",
# }
# Request headers shared by every helper in this module; empty, so no custom headers are added.
headers = dict()
# POST a JSON payload and decode the JSON reply.
def postUrl(url, json_data, timeout=10):
    """Send ``json_data`` as a JSON body to ``url`` and return the decoded JSON response.

    :param url: target URL.
    :param json_data: object serialised by ``requests`` into the request body.
    :param timeout: seconds to wait for the server before giving up; without
        one, ``requests`` can block indefinitely on a stalled connection.
    :return: the response body parsed as JSON.
    """
    result = requests.post(url, json=json_data, headers=headers, timeout=timeout)
    return result.json()

# GET a URL and decode the JSON reply.
def getUrlJson(url, timeout=10):
    """Fetch ``url`` with a GET request and return the decoded JSON response.

    :param url: target URL.
    :param timeout: seconds to wait for the server before giving up; without
        one, ``requests`` can block indefinitely on a stalled connection.
    :return: the response body parsed as JSON.
    """
    result = requests.get(url, headers=headers, timeout=timeout)
    return result.json()


def getUrl(url, timeout=10):
    """Fetch ``url`` and return its body parsed as a ``BeautifulSoup`` object.

    Proxy support through ``ip_list.get_ip_list_random()`` is currently
    disabled; the request goes out directly.

    :param url: page to download.
    :param timeout: seconds to wait for the server before giving up; without
        one, ``requests`` can block indefinitely on a stalled connection.
    :return: ``BeautifulSoup`` tree built with the ``html.parser`` backend.
    """
    content = requests.get(url, headers=headers, timeout=timeout).content
    return BeautifulSoup(content, "html.parser")

db.py:

import pymysql


class DataBaseHandle(object):
    """Small helper around a single PyMySQL connection.

    The connection is opened when the object is created.  Each operation uses
    its own cursor, reports database errors on stdout rather than raising, and
    always closes the cursor afterwards.
    """

    def __init__(self,host,username,password,database,port):
        """Remember the connection settings and open the connection.

        :param host: MySQL server host.
        :param username: login user.
        :param password: login password.
        :param database: schema to use.
        :param port: server TCP port.
        """
        self.host = host
        self.username = username
        self.password = password
        self.database = database
        self.port = port
        # Keyword arguments: PyMySQL 1.0+ no longer accepts positional connect() parameters.
        # utf8mb4 matches the table charset, so 4-byte characters are not rejected.
        self.db = pymysql.connect(
            host=self.host,
            user=self.username,
            password=self.password,
            database=self.database,
            port=self.port,
            charset='utf8mb4',
        )

    def _execute_write(self, sql):
        """Run one data-modifying statement, committing on success and rolling back on error.

        :param sql: complete SQL statement.
        :return: ``True`` if the statement was committed, ``False`` otherwise.
        """
        self.cursor = self.db.cursor()
        try:
            self.cursor.execute(sql)
            self.db.commit()
            return True
        except Exception as ex:
            # Undo the partial work and report instead of raising.
            self.db.rollback()
            print("数据库异常",ex)
            return False
        finally:
            self.cursor.close()

    def insertDB(self,sql):
        """Execute an INSERT statement; returns ``True`` on success, ``False`` on error."""
        return self._execute_write(sql)

    def deleteDB(self,sql):
        """Execute a DELETE statement; returns ``True`` on success, ``False`` on error."""
        return self._execute_write(sql)

    def updateDb(self,sql):
        """Execute an UPDATE statement; returns ``True`` on success, ``False`` on error."""
        return self._execute_write(sql)

    def selectDb(self,sql):
        """Run a query, print the rows and return them.

        :param sql: complete SELECT statement.
        :return: all rows as returned by ``fetchall()``; an empty tuple when
            the query fails, so callers can iterate the result unconditionally.
        """
        self.cursor = self.db.cursor()
        try:
            self.cursor.execute(sql)
            data = self.cursor.fetchall()
            print(data)
            return data
        except Exception as ex:
            print("数据库异常",ex)
            return ()
        finally:
            self.cursor.close()

    def closeDb(self):
        """Close the database connection."""
        self.db.close()



if __name__ == '__main__':
    # Manual smoke test: run one query (selectDb prints the rows) and disconnect.
    # NOTE(review): the connection credentials are hard-coded; consider moving them to configuration.
    DbHandle = DataBaseHandle('127.0.0.1','root','1qaz@WSX','test',3306)
    try:
        DbHandle.selectDb('select * from dept limit 5 ')
    finally:
        # Release the connection even if the query step is interrupted.
        DbHandle.closeDb()

实体封装 houseLianjiaVo.py:

class houseLianjiaVo:
    """Value object holding one scraped second-hand-house listing."""

    def __init__(self, code, img, title, href, region1, region2, house_structure, total_square, orientation,
                 decoration_degree, floor,material, create_time,release_time, total_price, data_price):
        """Store the listing fields; ``id`` stays ``None`` (auto-increment id)."""
        self.id = None  # auto-increment id
        self.code = code  # listing code (business key)
        self.img = img  # image URL
        self.title = title  # title
        self.href = href  # link to the listing
        self.region1 = region1  # residential community name
        self.region2 = region2  # district / area
        self.house_structure = house_structure  # room layout
        self.total_square = total_square  # total floor area
        self.orientation = orientation  # orientation
        self.decoration_degree = decoration_degree  # decoration level
        self.floor = floor  # floor
        self.material = material  # building material
        self.create_time = create_time  # creation time
        self.release_time = release_time  # release time
        self.total_price = total_price  # total price
        self.data_price = data_price  # unit price

    # Previously defined at module level, where it never became a method of the class.
    def __str__(self):
        """Return a short description made of the listing code and title."""
        return 'code:%s  title:%s  ' % (self.code, self.title)

数据库表结构

-- Table receiving the rows written by the scraper scripts.
-- The scripts insert positionally (null for `id`, then 17 values), so the
-- column order below must stay in step with encapsulation_db().
-- `type` holds the source-site label the scripts pass as house_type.
-- NOTE(review): `title` is limited to 32 characters; confirm scraped titles fit.
CREATE TABLE `house` (
  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT '自增id',
  `type` varchar(255) DEFAULT NULL,
  `code` varchar(32) DEFAULT NULL COMMENT '主键',
  `img` varchar(1000) DEFAULT NULL COMMENT '图片',
  `title` varchar(32) DEFAULT NULL COMMENT '标题',
  `href` varchar(256) DEFAULT NULL COMMENT '链接',
  `region1` varchar(32) DEFAULT NULL COMMENT '小区名',
  `region2` varchar(32) DEFAULT NULL COMMENT '区域',
  `house_structure` varchar(32) DEFAULT NULL COMMENT '房屋结构',
  `total_square` varchar(32) DEFAULT NULL COMMENT '总平方数',
  `orientation` varchar(32) DEFAULT NULL COMMENT '朝向',
  `decoration_degree` varchar(32) DEFAULT NULL COMMENT '装修程度',
  `floor` varchar(32) DEFAULT NULL COMMENT '楼层',
  `material` varchar(32) DEFAULT NULL COMMENT '材料',
  `create_time` varchar(32) DEFAULT NULL COMMENT '创建时间',
  `release_time` varchar(32) DEFAULT NULL COMMENT '发布时间',
  `total_price` varchar(32) DEFAULT NULL COMMENT '总价',
  `data_price` varchar(32) DEFAULT NULL COMMENT '单价',
  PRIMARY KEY (`id`) USING BTREE,
  KEY `idx_code` (`code`) USING BTREE COMMENT '编码索引'
) ENGINE=InnoDB AUTO_INCREMENT=14999 DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC COMMENT='房屋信息表-链家';

 

posted on 2020-05-04 21:13  继续潜水  阅读(653)  评论(0)  编辑  收藏  举报