01
Preface
Second-hand housing price prediction is a classic introductory data-analysis exercise, and many open house-price datasets exist for it. Those datasets are standards, but they are dated. So in this series I will document a from-scratch, Python-based prediction of second-hand housing prices, starting with data collection.
02
Collecting second-hand listing links
We use Lianjia (lianjia.com) as the example.
First we need the second-hand listing URLs for the target city; in the screenshot, each red box marks one such link. Right-click and inspect the element (or press F12) and trace each listing title's URL: every listing anchor is an `<a>` tag carrying the attribute `"data-el": "ershoufang"`, so we can parse the page with BeautifulSoup. Each listing card contains two such `<a>` tags, which means every link is extracted twice, so a simple `set()` deduplicates them.
The code is as follows:
def getHouseId(url):
    pageText = getPage(url)
    soup = BeautifulSoup(pageText, 'lxml')
    soupContent = soup.find_all(name="a", attrs={"data-el": "ershoufang"})
    houseIdSet = set()
    for a in soupContent:
        # Each listing appears twice; the set keeps one copy of every href
        houseIdSet.add(a.get("href"))
    return list(houseIdSet)
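To sanity-check the function, here is a minimal usage sketch; it assumes the getPage helper shown in the full-code section below and that Lianjia's current page layout still matches:
# Minimal check: how many unique listing links does page 1 yield?
links = getHouseId('https://sy.lianjia.com/ershoufang/pg1/')
print(len(links))   # typically 30 per list page
print(links[:3])    # peek at a few listing URLs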
03
Scraping the details of a single listing
Opening an individual listing page, the red boxes in the following screenshots mark the content we want to extract. We use the same approach as before and locate the tag and attributes of each target element.
def getHouseContent(url):
    houseInfoDict = {}
    pageText = getPage(url)
    soup = BeautifulSoup(pageText, 'lxml')
    # -- Overview block --
    # Total price
    price = soup.find(name="span", attrs={"class": "total"}).text
    houseInfoDict['总价'] = price
    # Price per square meter
    unitPrice = soup.find(name="span", attrs={"class": "unitPriceValue"}).text
    houseInfoDict['单位价格'] = unitPrice
    # Layout, floor, orientation and decoration duplicate the "basic
    # attributes" list below, so they are skipped here
    # Area block: the main figure also duplicates the basic attributes,
    # so only the building description (subInfo) is kept
    area = soup.find(name="div", attrs={"class": "area"})
    areaSubInfo = area.find(name="div", attrs={"class": "subInfo noHidden"}).text
    houseInfoDict['楼房信息'] = areaSubInfo
    # Community (xiaoqu) name
    community = soup.find(name="div", attrs={"class": "communityName"})
    communityInfo = community.find(name="a", attrs={"class": "info"}).text
    houseInfoDict['小区'] = communityInfo
    # District
    areaDistrict = soup.find(name="div", attrs={"class": "areaName"})
    areaDistrictInfo = areaDistrict.find_all(name="a")[0].text
    houseInfoDict['所属区县'] = areaDistrictInfo
    # -- Basic attributes --
    base = soup.find(name="div", attrs={"class": "base"})
    baseInfo = base.find_all(name="li")
    for li in baseInfo:
        # The first four characters of each <li> are the field label
        key = li.text[:4]
        houseInfoDict[key] = li.text[4:]
    # -- Transaction attributes --
    transaction = soup.find(name="div", attrs={"class": "transaction"})
    transactionInfo = transaction.find_all(name="li")
    for li in transactionInfo:
        # Drop line breaks and spaces before splitting label and value
        liText = li.text.replace('\r', '').replace('\n', '').replace(' ', '')
        key = liText[:4]
        houseInfoDict[key] = liText[4:]
    # -- Room-by-room layout --
    layout = soup.find(name="div", attrs={"class": "layout"})
    roomRow = layout.find_all(name="div", attrs={"class": "row"})
    houseInfoDict['户型分间'] = {}
    for row in roomRow:
        # Columns per row: room name, area, orientation, window type
        col = row.find_all(name="div", attrs={"class": "col"})
        key = col[0].text
        if len(col) > 1:
            houseInfoDict['户型分间'][key + "面积"] = col[1].text
        if len(col) > 2:
            houseInfoDict['户型分间'][key + "朝向"] = col[2].text
        if len(col) > 3:
            houseInfoDict['户型分间'][key + "窗型"] = col[3].text
The overview, basic-attribute, and room-layout data can all be scraped with BeautifulSoup directly, but the last part, the community summary, cannot: it is rendered from JSON fetched separately, so we have to look it up in the Network panel. After pressing F12, filtering the Network panel by Fetch/XHR reveals the request that carries this data; its Headers tab shows the request URL:
Request URL: https://sy.lianjia.com/ershoufang/housestat?hid=102105464046&rid=3120050194861778
Here hid is the house id and rid is the community id, so we need to obtain both.
The code is as follows:
# Community summary
# rid: community (resblock) id, taken from a data attribute of the page
rid = soup.find(name="div", attrs={"id": "framesdk"}).get("data-resblock-id")
# hid: house id; the span text carries two trailing characters after the
# numeric id, hence the [:-2]
houseRecord = soup.find(name="div", attrs={"class": "houseRecord"})
hid = houseRecord.find(name="span", attrs={"class": "info"}).text[:-2]
# Shenyang data; change the URL prefix for another city
xiaoquInfoUrl = 'https://sy.lianjia.com/ershoufang/housestat?hid=' + str(hid) + '&rid=' + str(rid)
xiaoquInfo = json.loads(getPage(xiaoquInfoUrl))
buildYear = xiaoquInfo['data']['resblockCard']['buildYear']
buildNum = xiaoquInfo['data']['resblockCard']['buildNum']
unitPrice = xiaoquInfo['data']['resblockCard']['unitPrice']
houseInfoDict['小区简介'] = {}
houseInfoDict['小区简介']['小区建造年份'] = buildYear
houseInfoDict['小区简介']['楼栋总数'] = buildNum
houseInfoDict['小区简介']['小区均价'] = unitPrice
try:
    saveData(houseInfoDict)
except Exception as e:
    print(e)
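For orientation, the relevant part of the housestat response presumably looks like the trimmed sketch below. This shape is an assumption inferred purely from the keys the code reads; the real payload carries many more fields, and xiaoquInfoExample is a hypothetical name used only for illustration:
# Hypothetical, trimmed response shape (inferred from the access paths above)
xiaoquInfoExample = {
    "data": {
        "resblockCard": {
            "buildYear": "...",   # year the community was built
            "buildNum": "...",    # total number of buildings
            "unitPrice": "..."    # average price in the community
        }
    }
}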
The complete code for scraping a listing's details is as follows:
def getHouseContent(url):
    houseInfoDict = {}
    pageText = getPage(url)
    soup = BeautifulSoup(pageText, 'lxml')
    # -- Overview block --
    # Total price
    price = soup.find(name="span", attrs={"class": "total"}).text
    houseInfoDict['总价'] = price
    # Price per square meter
    unitPrice = soup.find(name="span", attrs={"class": "unitPriceValue"}).text
    houseInfoDict['单位价格'] = unitPrice
    # Layout, floor, orientation and decoration duplicate the "basic
    # attributes" list below, so they are skipped here
    # Area block: the main figure also duplicates the basic attributes,
    # so only the building description (subInfo) is kept
    area = soup.find(name="div", attrs={"class": "area"})
    areaSubInfo = area.find(name="div", attrs={"class": "subInfo noHidden"}).text
    houseInfoDict['楼房信息'] = areaSubInfo
    # Community (xiaoqu) name
    community = soup.find(name="div", attrs={"class": "communityName"})
    communityInfo = community.find(name="a", attrs={"class": "info"}).text
    houseInfoDict['小区'] = communityInfo
    # District
    areaDistrict = soup.find(name="div", attrs={"class": "areaName"})
    areaDistrictInfo = areaDistrict.find_all(name="a")[0].text
    houseInfoDict['所属区县'] = areaDistrictInfo
    # -- Basic attributes --
    base = soup.find(name="div", attrs={"class": "base"})
    baseInfo = base.find_all(name="li")
    for li in baseInfo:
        # The first four characters of each <li> are the field label
        key = li.text[:4]
        houseInfoDict[key] = li.text[4:]
    # -- Transaction attributes --
    transaction = soup.find(name="div", attrs={"class": "transaction"})
    transactionInfo = transaction.find_all(name="li")
    for li in transactionInfo:
        # Drop line breaks and spaces before splitting label and value
        liText = li.text.replace('\r', '').replace('\n', '').replace(' ', '')
        key = liText[:4]
        houseInfoDict[key] = liText[4:]
    # -- Room-by-room layout --
    layout = soup.find(name="div", attrs={"class": "layout"})
    roomRow = layout.find_all(name="div", attrs={"class": "row"})
    houseInfoDict['户型分间'] = {}
    for row in roomRow:
        # Columns per row: room name, area, orientation, window type
        col = row.find_all(name="div", attrs={"class": "col"})
        key = col[0].text
        if len(col) > 1:
            houseInfoDict['户型分间'][key + "面积"] = col[1].text
        if len(col) > 2:
            houseInfoDict['户型分间'][key + "朝向"] = col[2].text
        if len(col) > 3:
            houseInfoDict['户型分间'][key + "窗型"] = col[3].text
    # -- Community summary (fetched from the housestat AJAX endpoint) --
    # rid: community (resblock) id, taken from a data attribute of the page
    rid = soup.find(name="div", attrs={"id": "framesdk"}).get("data-resblock-id")
    # hid: house id; the span text carries two trailing characters after
    # the numeric id, hence the [:-2]
    houseRecord = soup.find(name="div", attrs={"class": "houseRecord"})
    hid = houseRecord.find(name="span", attrs={"class": "info"}).text[:-2]
    # Shenyang data; change the URL prefix for another city
    xiaoquInfoUrl = 'https://sy.lianjia.com/ershoufang/housestat?hid=' + str(hid) + '&rid=' + str(rid)
    xiaoquInfo = json.loads(getPage(xiaoquInfoUrl))
    buildYear = xiaoquInfo['data']['resblockCard']['buildYear']
    buildNum = xiaoquInfo['data']['resblockCard']['buildNum']
    unitPrice = xiaoquInfo['data']['resblockCard']['unitPrice']
    houseInfoDict['小区简介'] = {}
    houseInfoDict['小区简介']['小区建造年份'] = buildYear
    houseInfoDict['小区简介']['楼栋总数'] = buildNum
    houseInfoDict['小区简介']['小区均价'] = unitPrice
    try:
        saveData(houseInfoDict)
    except Exception as e:
        print(e)
Across listings the scraped data dimensions are not perfectly consistent, so we store each record in MongoDB, a schemaless non-relational database. The code is as follows:
MONGO_URL = 'localhost'
# Database name
MONGO_DB = 'lianjia'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

def saveData(houseDict):
    # Shenyang data; change the collection name db['tableName'] for another city
    # insert_one replaces the long-deprecated Collection.save
    fieldCount = len(houseDict)
    db['shenyang'].insert_one(houseDict)
    print("Saved one document with " + str(fieldCount) + " fields!")
04
Scraping all listings for the city
Scrolling a city's listing index to the bottom shows that only 100 pages are exposed, with 30 listing links per page. The per-page URLs follow an obvious pattern:
https://sy.lianjia.com/ershoufang/pg1/
https://sy.lianjia.com/ershoufang/pg2/
https://sy.lianjia.com/ershoufang/pg3/
So a simple loop that builds each page's URL is all we need.
The code is as follows:
for i in range(1, 101):
    # Shenyang data; change the URL prefix for another city
    url = 'https://sy.lianjia.com/ershoufang/pg' + str(i)
    houseIdList = getHouseId(url)
    print("Page " + str(i))
    for j in range(len(houseIdList)):
        print(j)
        try:
            getHouseContent(houseIdList[j])
        except Exception as e:
            print(e)
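Firing 100 list pages plus every detail request back to back risks getting blocked. A hedged variant of the loop adds a small pause between detail requests; the one-second delay is an arbitrary choice, not anything Lianjia documents:
import time

for i in range(1, 101):
    url = 'https://sy.lianjia.com/ershoufang/pg' + str(i)
    houseIdList = getHouseId(url)
    print("Page " + str(i))
    for j in range(len(houseIdList)):
        try:
            getHouseContent(houseIdList[j])
        except Exception as e:
            print(e)
        time.sleep(1)  # arbitrary politeness delay; tune as needed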
05
Full code
import requests
from bs4 import BeautifulSoup
import json
import pymongo

MONGO_URL = 'localhost'
MONGO_DB = 'lianjia'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

def getPage(url):
    try:
        # NOTE: the browser-like User-Agent is an addition; the original code
        # sent none, and it is a common courtesy/anti-blocking measure
        headers = {'User-Agent': 'Mozilla/5.0'}
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
    except Exception as e:
        print(e)
    # Falls through to None on a non-200 status or a request error

def getHouseId(url):
    pageText = getPage(url)
    soup = BeautifulSoup(pageText, 'lxml')
    soupContent = soup.find_all(name="a", attrs={"data-el": "ershoufang"})
    houseIdSet = set()
    for a in soupContent:
        # Each listing appears twice; the set keeps one copy of every href
        houseIdSet.add(a.get("href"))
    return list(houseIdSet)

def saveData(houseDict):
    # Shenyang data; change the collection name db['tableName'] for another city
    # insert_one replaces the long-deprecated Collection.save
    fieldCount = len(houseDict)
    db['shenyang'].insert_one(houseDict)
    print("Saved one document with " + str(fieldCount) + " fields!")

def getHouseContent(url):
    houseInfoDict = {}
    pageText = getPage(url)
    soup = BeautifulSoup(pageText, 'lxml')
    # -- Overview block --
    # Total price
    price = soup.find(name="span", attrs={"class": "total"}).text
    houseInfoDict['总价'] = price
    # Price per square meter
    unitPrice = soup.find(name="span", attrs={"class": "unitPriceValue"}).text
    houseInfoDict['单位价格'] = unitPrice
    # Layout, floor, orientation and decoration duplicate the "basic
    # attributes" list below, so they are skipped here
    # Area block: the main figure also duplicates the basic attributes,
    # so only the building description (subInfo) is kept
    area = soup.find(name="div", attrs={"class": "area"})
    areaSubInfo = area.find(name="div", attrs={"class": "subInfo noHidden"}).text
    houseInfoDict['楼房信息'] = areaSubInfo
    # Community (xiaoqu) name
    community = soup.find(name="div", attrs={"class": "communityName"})
    communityInfo = community.find(name="a", attrs={"class": "info"}).text
    houseInfoDict['小区'] = communityInfo
    # District
    areaDistrict = soup.find(name="div", attrs={"class": "areaName"})
    areaDistrictInfo = areaDistrict.find_all(name="a")[0].text
    houseInfoDict['所属区县'] = areaDistrictInfo
    # -- Basic attributes --
    base = soup.find(name="div", attrs={"class": "base"})
    baseInfo = base.find_all(name="li")
    for li in baseInfo:
        # The first four characters of each <li> are the field label
        key = li.text[:4]
        houseInfoDict[key] = li.text[4:]
    # -- Transaction attributes --
    transaction = soup.find(name="div", attrs={"class": "transaction"})
    transactionInfo = transaction.find_all(name="li")
    for li in transactionInfo:
        # Drop line breaks and spaces before splitting label and value
        liText = li.text.replace('\r', '').replace('\n', '').replace(' ', '')
        key = liText[:4]
        houseInfoDict[key] = liText[4:]
    # -- Room-by-room layout --
    layout = soup.find(name="div", attrs={"class": "layout"})
    roomRow = layout.find_all(name="div", attrs={"class": "row"})
    houseInfoDict['户型分间'] = {}
    for row in roomRow:
        # Columns per row: room name, area, orientation, window type
        col = row.find_all(name="div", attrs={"class": "col"})
        key = col[0].text
        if len(col) > 1:
            houseInfoDict['户型分间'][key + "面积"] = col[1].text
        if len(col) > 2:
            houseInfoDict['户型分间'][key + "朝向"] = col[2].text
        if len(col) > 3:
            houseInfoDict['户型分间'][key + "窗型"] = col[3].text
    # -- Community summary (fetched from the housestat AJAX endpoint) --
    # rid: community (resblock) id, taken from a data attribute of the page
    rid = soup.find(name="div", attrs={"id": "framesdk"}).get("data-resblock-id")
    # hid: house id; the span text carries two trailing characters after
    # the numeric id, hence the [:-2]
    houseRecord = soup.find(name="div", attrs={"class": "houseRecord"})
    hid = houseRecord.find(name="span", attrs={"class": "info"}).text[:-2]
    # Shenyang data; change the URL prefix for another city
    xiaoquInfoUrl = 'https://sy.lianjia.com/ershoufang/housestat?hid=' + str(hid) + '&rid=' + str(rid)
    xiaoquInfo = json.loads(getPage(xiaoquInfoUrl))
    buildYear = xiaoquInfo['data']['resblockCard']['buildYear']
    buildNum = xiaoquInfo['data']['resblockCard']['buildNum']
    unitPrice = xiaoquInfo['data']['resblockCard']['unitPrice']
    houseInfoDict['小区简介'] = {}
    houseInfoDict['小区简介']['小区建造年份'] = buildYear
    houseInfoDict['小区简介']['楼栋总数'] = buildNum
    houseInfoDict['小区简介']['小区均价'] = unitPrice
    try:
        saveData(houseInfoDict)
    except Exception as e:
        print(e)

for i in range(1, 101):
    # Shenyang data; change the URL prefix for another city
    url = 'https://sy.lianjia.com/ershoufang/pg' + str(i)
    houseIdList = getHouseId(url)
    print("Page " + str(i))
    for j in range(len(houseIdList)):
        print(j)
        try:
            getHouseContent(houseIdList[j])
        except Exception as e:
            print(e)
06
Summary
That is it for data collection; data cleaning, visualization, and the price-prediction model will follow in later posts.
· END ·
A Jiji with ideals and passion