Commit 441bde99 authored by Shawn You's avatar Shawn You 💬

update

parent 29ddc1e9
......@@ -25,6 +25,9 @@ testDataPath = '_3_1_test-data/data.json'
# predice data path | 评估数据
predictDataPath = '_3_2_predict-data/data.json'
# 评估数据结果
resultDataPath = '_5_evaluation-result/data.json'
#=======================================
# 选取的特征
#=======================================
......
#!/usr/bin/env python
import pandas as pd
import _0_config
import numpy as np
#============================
......@@ -29,7 +30,9 @@ df = dfData.reset_index()
# Add and tidy up some derived fields
dfData['decoration'] = dfData['decorateType']
#====roomNum
# NOTE(review): the next two lines are an old/new pair from a diff — the
# second assignment overwrites the first, wrapping the stripped room count
# in pd.to_numeric so np.isfinite() below can be applied.
dfData['bedroomNum'] = dfData['roomNum'].replace(to_replace='[^\d].+', value='', regex=True)
dfData['bedroomNum'] = pd.to_numeric(dfData['roomNum'].replace(to_replace='[^\d].+', value='', regex=True))
# Drop abnormal rows whose bedroom count did not parse to a finite number
dfData = dfData[np.isfinite(dfData['bedroomNum'])]
# delete no use columns
willDeleteKeys = []
......@@ -98,6 +101,7 @@ df.to_json(path_or_buf=_0_config.cleanDataPath, orient='records')
#============================
# Clean the for-sale ("saling") listings: load the raw scraped JSON,
# drop columns the model does not use, and persist the cleaned records.
#============================
data = pd.read_json(_0_config.salingDataPath)
df = pd.DataFrame(data)
# Use the keyword form for the axis: `df.drop(['url'], 1, ...)` relies on
# the positional `axis` argument, which was deprecated in pandas 1.0 and
# removed in pandas 2.0. `axis=1` is equivalent on every pandas version.
df.drop(['url'], axis=1, inplace=True)
df = df.reset_index()
df.to_json(path_or_buf=_0_config.salingCleanDataPath, orient='records')
print('origin to clean successfully')
......@@ -29,4 +29,20 @@ trainData.to_json(path_or_buf= _0_config.trainDataPath)
# Hold out the tail of the frame (rows after trainCount) as the test split
# and persist it to the configured path.
testData = df[trainCount:totalRowCount]
testData.to_json(path_or_buf = _0_config.testDataPath)
#============================
# Transform and clean the for-sale houses that will be evaluated
#============================
# Load the cleaned for-sale listings and encode their categorical columns
# with the lookup tables declared in the config module, then persist the
# encoded frame as the prediction input.
data = pd.read_json(_0_config.salingCleanDataPath)
df = pd.DataFrame(data)
# Column -> code-table mapping; applied in this order for each row.
encoders = {
    'decoration': _0_config.decorations,
    'districtName': _0_config.districtName,
    'regionName': _0_config.regionName,
}
transedData = []
for _, record in df.iterrows():
    for column, table in encoders.items():
        record[column] = table[record[column]]
    transedData.append(record)
df = pd.DataFrame(transedData).reset_index()
df.to_json(path_or_buf=_0_config.predictDataPath)
print('train and test data ready!')
......@@ -29,14 +29,7 @@ dftest = dftest.drop(['price'], 1, inplace=False)
#================
# Evaluation data
#================
# NOTE(review): old/new pair from a diff — the manual json.load plus
# per-key remapping below (down to the from_dict line) was replaced by
# the single pd.read_json call that follows it; the pd.read_json result
# is what `dfpredict` ends up bound to.
predictJson = []
with open(_0_config.predictDataPath) as json_data:
predictJson = json.load(json_data)
predictJson['decoration']['1'] = _0_config.decorations[predictJson['decoration']['1']]
predictJson['districtName']['1'] = _0_config.districtName[predictJson['districtName']['1']]
predictJson['regionName']['1'] = _0_config.regionName[predictJson['regionName']['1']]
#predictJson['resblockName']['1'] = _0_config.resblockName[predictJson['resblockName']['1']]
dfpredict = pd.DataFrame.from_dict(predictJson);
dfpredict = pd.read_json(path_or_buf=_0_config.predictDataPath);
#================
......@@ -76,8 +69,17 @@ for column in dftrain.columns:
#0.0000002 3598.0186
#================
# NOTE(review): old/new pair from a diff — the first optimizer line
# (lr = 1.09e-7) was replaced by the lower learning rate (1.09e-8)
# assigned a few lines below; the second assignment wins.
opt = tf.train.GradientDescentOptimizer(learning_rate=0.000000109)
#estimator = tf.contrib.learn.LinearRegressor(feature_columns=features, optimizer=opt, model_dir="./model/")
#opt = tf.train.GradientDescentOptimizer(learning_rate=0.000000109)
opt = tf.train.GradientDescentOptimizer(learning_rate=0.0000000109)
#=== Linear regression (the active estimator)
estimator = tf.contrib.learn.LinearRegressor(
feature_columns=features,
optimizer=opt,
#model_dir="./model/"
)
#=== Neural-network regression alternative, disabled by wrapping it in the
# triple-quoted string below (effectively commented out).
'''
estimator = tf.contrib.learn.DNNRegressor(
feature_columns=features,
hidden_units=[50, 20, 50, 10],
......@@ -85,11 +87,8 @@ estimator = tf.contrib.learn.DNNRegressor(
optimizer=opt,
#model_dir="./saved_model/"
)
'''
# TensorFlow provides many helper methods to read and set up data sets.
# Here we use two data sets: one for training and one for evaluation
# We have to tell the function how many batches
# of data (num_epochs) we want and how big each batch should be.
# Containers for the per-feature arrays fed to the estimator —
# presumably feature-name -> ndarray; confirm against the fill loop below.
xs_train = {}
xs_eval = {}
xs_predict = {}
......@@ -113,15 +112,15 @@ eval_input_fn = tf.contrib.learn.io.numpy_input_fn(
# NOTE(review): the line below is the tail of an input_fn(...) call whose
# opening lines fall outside this fragment.
num_epochs=1000)
print('start fit')
# NOTE(review): old/new pair from a diff — training steps were reduced
# from 100000 to 10000; as written, both fit calls would run in sequence.
estimator.fit(input_fn=input_fn, steps=100000)
estimator.fit(input_fn=input_fn, steps=10000)
print('end fit')
print('end train loss')
# NOTE(review): old/new pairs — the evaluate/print diagnostics were
# commented out in the newer revision; the uncommented lines are the old ones.
train_loss = estimator.evaluate(input_fn=input_fn)
#train_loss = estimator.evaluate(input_fn=input_fn)
eval_loss = estimator.evaluate(input_fn=eval_input_fn)
print("train loss: %r"% train_loss)
print("eval loss: %r"% eval_loss)
#eval_loss = estimator.evaluate(input_fn=eval_input_fn)
#print("train loss: %r"% train_loss)
#print("eval loss: %r"% eval_loss)
print ("Local current time :", time.asctime( time.localtime(time.time()) ))
......@@ -129,6 +128,15 @@ print ("Local current time :", time.asctime( time.localtime(time.time()) ))
# Predict prices for the currently-for-sale houses
#================
predictResult = list(estimator.predict(xs_predict))
# NOTE(review): "predictL%r" looks like a typo for "predict:%r" — left
# untouched here since it is a runtime string.
print("predictL%r"% predictResult)
df = pd.DataFrame(predictResult)
df = df.reset_index()
df.to_json(path_or_buf=_0_config.resultDataPath)
#print("predictL%r"% predictResult)
# BUG(review): bare `exit` (no parentheses) does NOT terminate the script —
# it evaluates the builtin object and discards it. Use sys.exit() if an
# early stop is actually intended; otherwise delete this leftover.
exit
#=========
# Check whether any NaN values are present (debug leftover)
#print(np.any(np.isnan(xs_train[column])))
#======
......@@ -28,7 +28,6 @@ fi
# Clean and transform the data
#==================================
./_1_origin-to-clean.py
# NOTE(review): this `exit` stops the pipeline right after the clean step,
# skipping everything below — looks like a debugging leftover; confirm
# whether it should be removed.
exit
#==================================
......
......@@ -24,7 +24,7 @@ class Myitem(scrapy.Item):
# Spider for currently-for-sale listings on Lianjia (Chongqing).
class SalingSpider(scrapy.Spider):
name = 'saling'
# NOTE(review): old/new pair from a diff — the start URL was changed to
# page 2 of the listing index; the second assignment wins.
start_urls = ['https://cq.lianjia.com/ershoufang/']
start_urls = ['https://cq.lianjia.com/ershoufang/pg2/']
# Follow each listing-card image link on the index page; the remainder of
# this method falls outside this fragment.
def parse(self, response):
for href in response.css('.sellListContent > li .img').xpath('@href').extract():
......@@ -42,7 +42,8 @@ class SalingSpider(scrapy.Spider):
# Extract the build year as the first number in the area sub-info text.
item['buildYear'] = re.findall(r"\d+\.?\d*",extract_with_css('.houseInfo .area .subInfo::text'))[0]
item['decoration'] = extract_with_css('.houseInfo .type .subInfo::text')
item['districtName'] = extract_with_css('.aroundInfo .info a::text')
# NOTE(review): old/new pair from a diff — regionName originally reused the
# first anchor (same as districtName); the replacement below takes the
# second anchor via extract()[1] instead.
item['regionName'] = extract_with_css('.aroundInfo .info a::text')
#item['regionName'] = extract_with_css('.aroundInfo .info a::last::text')
item['regionName'] = response.css('.aroundInfo .info a::text').extract()[1]
item['bedroomNum'] = re.findall(r"\d+\.?\d*",extract_with_css('.houseInfo .room .mainInfo::text'))[0]
item['square'] = re.findall(r"\d+\.?\d*",extract_with_css('.houseInfo .area .mainInfo::text'))[0]
item['price'] = extract_with_css('.overview .price .total::text')
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment