...
 
Commits (2)
......@@ -9,9 +9,12 @@
#=======================================
# origin data path | raw data as scraped (sold houses)
originDataPath = '_1_origin-data/saled.json'
# raw data for houses currently on sale
salingDataPath = '_1_origin-data/saling.json'
# clean data path | cleaned data
cleanDataPath = '_2_clean-data/data.json'
# cleaned data for houses currently on sale
salingCleanDataPath = '_2_clean-data/saling.json'
# train data path | training data
trainDataPath = '_3_0_train-data/data.json'
......@@ -22,6 +25,18 @@ testDataPath = '_3_1_test-data/data.json'
# predict data path | data to be evaluated
predictDataPath = '_3_2_predict-data/data.json'
#=======================================
# Selected features: the column names kept by the cleaning script
#=======================================
keys = [
'buildYear',
'districtName',
'regionName',
'bedroomNum',
'square',
'decoration',
'price',
]
#=======================================
......
......@@ -2,8 +2,10 @@
import pandas as pd
import _0_config
#print(_0_config.originDataPath)
#============================
# Clean and transform the training and test data
#============================
# Load the scraped JSON file and keep only its 'data' entry.
data = pd.read_json(_0_config.originDataPath)
data = data['data']
......@@ -14,37 +16,29 @@ for index, row in data.iteritems():
# Flatten the 'sold_house' listings found in tplData into one table of house records.
df = pd.DataFrame(jsonData)
tplData = df['tplData']
rows = []
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (label, value) pairs on both old and new pandas versions.
for _, row in tplData.items():
    soldHouse = row['sold_house']
    # Only entries that actually have a 'list' key contribute rows.
    if 'list' in soldHouse:
        # extend() appends in place instead of copying the whole list each time.
        rows.extend(soldHouse['list'])
dfData = pd.DataFrame(rows)
df = dfData.reset_index()
# NOTE(review): this prints the bound `describe` method itself (it is never
# called), and exit() then stops the script before any of the cleaning below
# runs -- looks like leftover debugging; confirm before relying on the rest.
print(df.describe)
exit()
#df.to_csv(path_or_buf=_0_config.cleanDataPath + '_')
#exit()
# Add / tidy up some fields
dfData['decoration'] = dfData['decorateType']
#====roomNum
# bedroomNum is roomNum with everything removed from the first non-digit
# character that is followed by at least one more character.
# The pattern is a raw string so that `\d` reaches the regex engine without
# triggering Python's invalid-escape-sequence warning.
dfData['bedroomNum'] = dfData['roomNum'].replace(to_replace=r'[^\d].+', value='', regex=True)
# Delete the columns that are not used.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
# The duplicated 'orientation' label from the original list is listed once.
dfData.drop([
    'viewUrl', 'valid', 'unitPrice',
    'type', 'title', 'sign_time',
    'serviceTitle', 'resblockId', 'regionUrl',
    'imgSrc', 'houseId', 'communityUrl',
    'cityName', 'districtUrl', 'totalFloor',
    'floorStat', 'hbtName', 'orientation',
    'isFeel', 'isTrendDown',
    'schoolName', 'se',
    'subwayInfo', 'tags', 'signTime',
    'soldFeel', 'resblockName',
], axis=1, inplace=True)
#=====signTime
# Disabled: convert signTime to a number after stripping the dots.
# Kept as a real comment (not a bare string literal) so the `\.` in the
# pattern no longer produces an invalid-escape-sequence warning.
# dfData['signTime'] = pd.to_numeric(dfData['signTime'].replace(to_replace=r'\.', value='', regex=True), errors='coerce')
# Drop every column that is not one of the configured feature keys.
willDeleteKeys = [
    column for column in dfData.columns
    if column not in _0_config.keys
]
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
dfData.drop(willDeleteKeys, axis=1, inplace=True)
#=====buildYear 2011
# filter build year is zero
......@@ -71,45 +65,39 @@ dfData = dfData[
(dfData['districtName'] == '渝中') |
(dfData['districtName'] == '渝北')
]
#Jiulongpo(1)/Nan'an(2)/...
#====Jiulongpo(1)/Nan'an(2)/...
# Replace each district name with an integer code: its 1-based position
# in the list of group keys.
temp = dfData.groupby(['districtName'])
keys = list(temp.groups.keys())
for i, val in enumerate(keys):
dfData.loc[dfData['districtName'] == val,['districtName']] = i + 1
#====regionName Sanxia Square(1)/Shangqingsi(2)/...
# Same integer encoding, applied to regionName.
temp = dfData.groupby(['regionName'])
keys = list(temp.groups.keys())
for i, val in enumerate(keys):
#print(str(i + 1)+ ' ' + val)
dfData.loc[dfData['regionName'] == val,['regionName']] = i + 1
#====resblockName V8 Community(1)/morning Mansion(2)/Yicheng Longzhou(3)/...
# Disabled: the same integer encoding for resblockName. The code sits inside
# a bare string literal, so it is never executed.
'''
temp = dfData.groupby(['resblockName'])
keys = list(temp.groups.keys())
for i, val in enumerate(keys):
#print(str(i + 1)+ ' ' + val)
dfData.loc[dfData['resblockName'] == val,['resblockName']] = i + 1
'''
#====roomNum
# bedroomNum is roomNum with everything removed from the first non-digit
# character that is followed by at least one more character.
# The pattern is a raw string so that `\d` reaches the regex engine without
# triggering Python's invalid-escape-sequence warning.
dfData['bedroomNum'] = dfData['roomNum'].replace(to_replace=r'[^\d].+', value='', regex=True)
# The source column is no longer needed once bedroomNum exists.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
dfData.drop(['roomNum'], axis=1, inplace=True)
# nothing
#====square 113
#====square 113
# nothing
#====price
# nothing
# write to file
#print(dfData.tail(10))
# reset_index() moves the current index into a column; the rows are then
# written to the clean-data path as a JSON array of records.
df = dfData.reset_index()
df.to_json(path_or_buf=_0_config.cleanDataPath, orient='records')
print('origin to clean successfully')
#print(dfData.loc[0])
#============================
# Clean and transform the houses that need to be evaluated
#============================
# No per-field cleaning is applied here: the on-sale data is loaded,
# re-indexed and written straight to the on-sale clean-data path.
data = pd.read_json(_0_config.salingDataPath)
df = pd.DataFrame(data)
df = df.reset_index()
df.to_json(path_or_buf=_0_config.salingCleanDataPath, orient='records')
print('origin to clean successfully')
......@@ -18,7 +18,7 @@ log(){
#==================================
# Data scraping switch
SPIDER_ON=false
# NOTE(review): the next line compares the literal string "SPIDER_ON" with
# "true", which can never be equal; the line after it expands the variable.
if [ "SPIDER_ON" = true ]; then
if [ "$SPIDER_ON" = true ]; then
cd lianjia
./spider.sh
cd ..
......@@ -28,6 +28,7 @@ fi
# Clean and transform the data
#==================================
./_1_origin-to-clean.py
# Stops the script here, so nothing after this line is run.
exit
#==================================
......
#!/usr/bin/env sh
# Scrape information on houses that have been sold
./my-run-sh/saled.sh
#./my-run-sh/saled.sh
# Scrape information on houses currently on sale
./my-run-sh/saling.sh
......