...
 
Commits (25)
#This file is in response to an email from Jonathan Silverstein at the City of Chicago.
# Pull assessment HEAD records joined to IDOR sales, keeping only sales whose
# deed type is not 'Q', 'E', or 'B'.
# NOTE(review): the join condition 'H.TAX_YEAR = H.TAX_YEAR' is a tautology
# (always true for non-NULL years) -- presumably 'I.TAX_YEAR' was intended, or
# the clause can be dropped since the next line already matches H.TAX_YEAR to
# YEAR(I.RECORDED_DATE). Confirm before relying on this query.
onemil <-
# result = tryCatch(
dbGetQuery(CCAODATA,
paste0("
SELECT * FROM HEAD AS H
INNER JOIN
IDORSALES AS I
ON H.HD_PIN=I.PIN AND H.TAX_YEAR = H.TAX_YEAR
AND H.TAX_YEAR=YEAR(I.RECORDED_DATE)
WHERE I.DEED_TYPE NOT IN ('Q', 'E', 'B')
"))
# Coerce to numeric so the price filter below works even if the driver
# returned SALE_PRICE as character
onemil$SALE_PRICE <- as.numeric(onemil$SALE_PRICE)
# Exclude classes 211/212 and keep sales of $1,000,000 or more
df1 <- onemil %>% filter(!HD_CLASS %in% c(211, 212)) %>% filter(SALE_PRICE >= 1000000)
......@@ -283,17 +283,31 @@ assmntdata <- e.recode$num(assmntdata, c("TOWN_CODE", "NBHD", "TRACTCE"))
# Preserve the original neighborhood code before any recoding
assmntdata$NBHD_mapping <- assmntdata$NBHD
# Variables for legacy model
assmntdata <- legacy_recodes(assmntdata)
# generate combined neighborhood/towncode variable specific to modeling groups to avoid having to use interaction terms
assmntdata$town_nbhd <- as.factor(paste0(assmntdata$TOWN_CODE, assmntdata$NBHD))
assmntdata$modeling_group <- as.factor(assmntdata$modeling_group)
# One town_nbhd_<group> column per modeling group, NA outside that group.
# Uses [[ ]] with computed names instead of eval(parse(text = ...)).
for (grp in levels(assmntdata$modeling_group)) {
  col <- paste0("town_nbhd_", grp)
  assmntdata[[col]] <- assmntdata$town_nbhd
  # BUG FIX: the original compared against 'assmnting_group', a column that
  # does not exist (NULL), so the NA-masking silently did nothing. The
  # parallel modeldata loop uses 'modeling_group'; match it here.
  assmntdata[[col]][assmntdata$modeling_group != grp] <- NA
}
# reassign neighborhoods that lacked sales in modeldata construction
assmntdata <- geo_recodes(assmntdata)
# Drop factor levels emptied by the masking/recoding above
for (grp in levels(assmntdata$modeling_group)) {
  col <- paste0("town_nbhd_", grp)
  assmntdata[[col]] <- droplevels(assmntdata[[col]])
}
# Want as factor once geographic recodes are completed
assmntdata <- e.recode$fac(assmntdata, c("TOWN_CODE", "NBHD", "TRACTCE"))
# Variables for legacy model
# NOTE(review): legacy_recodes() is also called above; confirm the second
# call is intentional (it may be stripped-diff residue).
assmntdata <- legacy_recodes(assmntdata)
#Predict price tercile using ordered logic model
assmntdata$price_tercile <- NA
assmntdata$modeling_group <- as.factor(assmntdata$modeling_group)
for(m in levels(assmntdata$modeling_group)){
model <- paste("TERC_MOD_", m, sep = "")
err <- try(predict(object=eval(parse(text=model)), newdata=subset(assmntdata, modeling_group==m), na.action = na.exclude))
......@@ -367,18 +381,14 @@ if (nbhd_experiment == 'on'){
assmntdata <- nbhd_recode_func(assmntdata, recoded_pins, 171, 80)
}
# generate combined neighborhood/towncode variable to avoid having to use interaction terms
assmntdata$town_nbhd <- as.factor(paste0(assmntdata$TOWN_CODE, assmntdata$NBHD))
# Relabel all levels in one vectorized pass (substr/paste0 are vectorized)
# instead of a 1:length(levels(...)) loop: "TTNNN" -> "TOWN TT | NBHD NNN"
lv <- levels(assmntdata$town_nbhd)
levels(assmntdata$town_nbhd) <- paste0("TOWN ", substr(lv, 1, 2), " | NBHD ", substr(lv, 3, 5))
# problematic pins excluded, usually due to missing data
# (read the exclusion list once rather than inline inside subset())
pin_exclusions <- read.xlsx("ccao_dictionary_draft.xlsx", sheetName = "pin_exclusions", stringsAsFactors = FALSE)[, 1]
assmntdata <- subset(assmntdata, !(assmntdata$PIN %in% pin_exclusions))
# Echo modeling-group counts for rows passing the first filter
summary(as.factor(subset(assmntdata, filter_1 >= 1)$modeling_group))
# Save ----
# NOTE(review): the argument list below contains stripped-diff residue -- the
# superseded arguments and the new combined line both survived, so
# modeldata/assmntdata/LOCF_MODEL/recoded_pins appear twice. Verify save()
# tolerates the repeats, then collapse to the single combined line.
save(dirs, integrity_checks
, modeldata, assmntdata, LOCF_MODEL
, recoded_pins
, modeldata, assmntdata, LOCF_MODEL, recoded_pins, tercile_models
, file=destfile)
# Smaller artifact carrying just the assessment data and supporting objects
save(assmntdata, integrity_checks, LOCF_MODEL, file=paste0(dirs$data, "assmntdata.Rda"))
......@@ -327,7 +327,19 @@ modeldata <- e.recode$num(modeldata, c("TOWN_CODE", "NBHD", "TRACTCE"))
# Preserve the original neighborhood code before any recoding
modeldata$NBHD_mapping <- modeldata$NBHD
modeldata <- geo_recodes(modeldata)
# Variables for legacy model
modeldata <- legacy_recodes(modeldata)
# generate combined neighborhood/towncode variable specific to modeling groups to avoid having to use interaction terms
modeldata$town_nbhd <- as.factor(paste0(modeldata$TOWN_CODE, modeldata$NBHD))
modeldata$modeling_group <- as.factor(modeldata$modeling_group)
# One town_nbhd_<group> column per modeling group, NA outside that group and
# with unused levels dropped. Uses [[ ]] with computed names instead of
# eval(parse(text = ...)).
for (grp in levels(modeldata$modeling_group)) {
  col <- paste0("town_nbhd_", grp)
  modeldata[[col]] <- modeldata$town_nbhd
  modeldata[[col]][modeldata$modeling_group != grp] <- NA
  modeldata[[col]] <- droplevels(modeldata[[col]])
}
# Want as factor once geographic recodes are completed
modeldata <- e.recode$fac(modeldata, c("TOWN_CODE", "NBHD", "TRACTCE"))
......@@ -335,7 +347,7 @@ modeldata <- e.recode$fac(modeldata, c("TOWN_CODE", "NBHD", "TRACTCE"))
#Add price tercile variable
modeldata$price_tercile <- NA
tercile_info <- NULL
modeldata$modeling_group <- as.factor(modeldata$modeling_group)
for(m in levels(modeldata$modeling_group)){
onethird <- quantile(modeldata$sale_price[modeldata$modeling_group == m &
!is.na(modeldata$sale_price) &
......@@ -360,9 +372,10 @@ m_tercile_integ <- modeldata[,-10] %>%
#create an ordered logit model to predict price tercile in unsold data
#Specify model parameters:
# NOTE(review): the two hard-coded formulas immediately below are dead code --
# both are overwritten by the spreadsheet-driven definitions that follow
# (stripped-diff residue left old and new versions stacked). Safe to delete
# the literals once confirmed.
sfmod <- price_tercile~1+BLDG_SF+HD_SF+FBATH+BEDS+ROOMS+EXT_WALL+LOCF1+PORCH+GARAGE_IND+AIR+HEAT
mfmod <- price_tercile~1+BLDG_SF+LOCF1+FBATH+BEDS+EXT_WALL
#ncmod <- price_tercile~1+HD_SF+n_units
# Tercile model specifications are maintained per-township in the dictionary
# workbook; pull the row for the township currently being assessed.
tercile_models <- read.xlsx("ccao_dictionary_draft.xlsx", sheetName = "price_tercile_model", stringsAsFactors=FALSE)
sfmod <- subset(tercile_models$sf_probit_model, tercile_models$township == target_township)
mfmod <- subset(tercile_models$mf_probit_model, tercile_models$township == target_township)
#ncmod <- subset(tercile_models$nchars_probit_model, tercile_models$township == target_township)
err <- try(TERC_MOD_SF <- polr(sfmod, data = modeldata, subset= modeldata$modeling_group=="SF", method = "probit", na.action = na.omit))
if(class(err)!="try-error"){
......@@ -464,9 +477,6 @@ rm(check, msg, ptacc, pt1acc, pt2acc, pt3acc, err, onethird, twothird)
modeldata$price_tercile_pred <- NULL
# Variables for legacy model
modeldata <- legacy_recodes(modeldata)
# legacy weights Dave uses
for(t in levels(modeldata$TOWN_CODE)){
for(n in levels(modeldata$NBHD)){
......@@ -484,11 +494,8 @@ if (nbhd_experiment == 'on'){
recoded_pins <- NA
}
# generate combined neighborhood/towncode variable to avoid having to use interaction terms
modeldata$town_nbhd <- as.factor(paste0(modeldata$TOWN_CODE, modeldata$NBHD))
# Relabel all levels in one vectorized pass (substr/paste0 are vectorized)
# instead of a 1:length(levels(...)) loop: "TTNNN" -> "TOWN TT | NBHD NNN"
lv <- levels(modeldata$town_nbhd)
levels(modeldata$town_nbhd) <- paste0("TOWN ", substr(lv, 1, 2), " | NBHD ", substr(lv, 3, 5))
# problematic pins excluded, usually due to missing data
# (read the exclusion list once rather than inline inside subset())
pin_exclusions <- read.xlsx("ccao_dictionary_draft.xlsx", sheetName = "pin_exclusions", stringsAsFactors = FALSE)[, 1]
modeldata <- subset(modeldata, !(modeldata$PIN %in% pin_exclusions))
# Labeling ----
# TO DO
......@@ -504,6 +511,5 @@ integ_check_f(length(unique(modeldata[,c("DOC_NO")]))==nrow(modeldata),
# Save data ----
save(dirs, integrity_checks
, modeldata, LOCF_MODEL
, recoded_pins
, modeldata, LOCF_MODEL, recoded_pins, tercile_models
, file=destfile)
\ No newline at end of file
......@@ -49,7 +49,7 @@ modeldata_trim_levels <- list()
# to send to desktop review
# Keep only the model specifications estimated via OLS or quantile regression.
# NOTE(review): stripped-diff residue had left the superseded condition
# (OLS | Quantile | GBM) stacked above this one; with both present, subset()
# treated the second condition as its 'select' argument, breaking the call.
# Only the current filter is kept.
models <- subset(
read.xlsx("ccao_dictionary_draft.xlsx", sheetName = "models", stringsAsFactors=FALSE)
, estimation_method %in% c("OLS", "Quantile")
# & modeling_group=="NCHARS"
)
# Modeling loop ----
......@@ -115,7 +115,6 @@ for(m in 1:nrow(models)){ for(i in 1:filter_iters){
,distribution="gaussian"
,interaction.depth=2
,n.trees=ntree
,cv.folds = 1
,data=subset(modeldata, filter_1>=filter_setting_1 & modeling_group==models$modeling_group[m])))
}
......@@ -197,30 +196,6 @@ for(m in 1:nrow(models)){ for(i in 1:filter_iters){
}
}
# Predict fitted values for the GBM lin-lin specification. Rows outside the
# current modeling group / trim filter keep NA fitted values.
if (models$estimation_method[m]=='GBM' && models$type[m] == 'lin-lin'){
  predictions <- NA
  TF <- modeldata$modeling_group==models$modeling_group[m] & modeldata$filter_1 >= 1
  error <- try(predict(model, newdata=modeldata[TF,], n.trees=ntree))
  if (!inherits(error, "try-error")){
    # Reuse the predictions computed above instead of the original's second,
    # unchecked predict() call (same result, half the work).
    predictions[TF] <- error
    modeldata$fitted_value <- predictions
    print("Values predicted")
  }
  else{
    if(filter_setting_1==1){
      # Failure at trim setting 1 is logged as a failed integrity check
      print(error)
      check <- paste0("Able to estimate all models with no trimming?")
      msg <- paste0("Bad: failed to predict values for ",models$model_id[m], " at trim setting 1")
      integrity_checks <- rbind(integrity_checks, data.frame(check=check, outcome=msg))
      print(check); print(msg); rm(msg)
      next
    }else{
      print(paste0("Error predicting values, skipping loop."))
      next
    }
  }
}
rm(predictions)
# Can't have negative predictions
......@@ -348,27 +323,10 @@ for(m in 1:nrow(models)){ for(i in 1:filter_iters){
print(summary(model))
}
}
# Model R-Squared ----
# Want direct comparison for models without explicit R-squareds
# Manually calculate R-squared as squared deviations from predictions?
r <- try(summary(model)$adj.r.squared)
if(is.null(r)|class(r)=="try-error"){r<-NA}
if(models$estimation_method[m]=="OLS"){
n <- nobs(model)
}else if(models$estimation_method[m]=="Quantile"){
n <-length(model2[["residuals"]])
}else if(models$estimation_method[m]=="GBM"){
n <- length(model[["fit"]])
}else{
n<-NA
}
print(paste0(models$model_id[m]," --> "
, "Check n= ", n, " in ", models$modeling_group[m]
, " Adjusted r = ", r
print(paste0(models$model_id[m]," --> "
, "Check n= ", nobs(model), " in ", models$modeling_group[m]
, " Adjusted r = ", round(summary(model)$adj.r.squared,3)
, " COD = ", COD$COD
, " COD NOT BOOTED = ",round(100*sum(abs(subset(modeldata, !is.na(ratio) & filter_1>=1 )$ratio-
median(subset(modeldata, !is.na(ratio) & filter_1>=1)$ratio))) /
......@@ -376,8 +334,17 @@ for(m in 1:nrow(models)){ for(i in 1:filter_iters){
median(subset(modeldata, !is.na(ratio) & filter_1>=1)$ratio)),2)
," PRD = ",COD$PRD
," PRB = ",COD$PRB))
rm(n)
}else{
print(paste0(models$model_id[m]," --> "
, "Check n= ", length(model[["residuals"]]), " in ", models$modeling_group[m]
, " COD = ", COD$COD
, " COD NOT BOOTED = ",round(100*sum(abs(subset(modeldata, !is.na(ratio) & filter_1>=1 )$ratio-
median(subset(modeldata, !is.na(ratio) & filter_1>=1)$ratio))) /
(nrow(subset(modeldata, !is.na(ratio) & filter_1>=1))*
median(subset(modeldata, !is.na(ratio) & filter_1>=1)$ratio)),2)
," PRD = ",COD$PRD
," PRB = ",COD$PRB))
}
if (holdout_switch == "y"){
print("Holdout")
# Holdout ----
......@@ -529,7 +496,13 @@ for(m in 1:nrow(models)){ for(i in 1:filter_iters){
print(paste0(user, " chose to skip holdout"))
}
# Model R-Squared ----
# Want direct comparison for models without explicit R-squareds
# Manually calculate R-squared as squared deviations from predictions?
# adj.r.squared is NULL for model classes without it (e.g. non-lm summaries)
# and try() captures errors; normalize both cases to NA.
r <- try(summary(model)$adj.r.squared)
if (is.null(r) || inherits(r, "try-error")) { r <- NA }
# Save Modeling Results ----
# IQR
p25 <- quantile(subset(modeldata, filter_1>=filter_setting_1)$ratio, c(.25), names = FALSE, na.rm=TRUE)
......@@ -635,7 +608,7 @@ names(modeldata_trim_levels) <- paste(paste(models$model_id,"_",sep=''),models$m
save(dirs, integrity_checks
, modeldata, assmntdata, LOCF_MODEL
, models, modeling_results, modeldata_trim_levels
, recoded_pins
, recoded_pins, tercile_models
, file=destfile)
print(warnings())
\ No newline at end of file
......@@ -426,10 +426,10 @@ nrow(subset(valuationdata, ratio_3>2 & !is.na(ratio_3) & filter_1>=1))
TF <- valuationdata$ratio_3 > 2 & valuationdata$filter_1 >= 1 & !is.na(valuationdata$ratio_3)
valuationdata$fitted_value_4[TF] <- valuationdata$most_recent_sale_price[TF]*2
# Estimated market values cannot be less than 80% of the value from a recent sale
nrow(subset(valuationdata, ratio_3<.8 & !is.na(ratio_3) & filter_1>=1))
TF <- valuationdata$ratio_3 < .8 & valuationdata$filter_1 >= 1 & !is.na(valuationdata$ratio_3)
valuationdata$fitted_value_4[TF] <- valuationdata$most_recent_sale_price[TF]*.8
# Estimated market values cannot be less than 70% of the value from a recent sale
nrow(subset(valuationdata, ratio_3<.7 & !is.na(ratio_3) & filter_1>=1))
TF <- valuationdata$ratio_3 < .7 & valuationdata$filter_1 >= 1 & !is.na(valuationdata$ratio_3)
valuationdata$fitted_value_4[TF] <- valuationdata$most_recent_sale_price[TF]*.7
}else{
valuationdata$fitted_value_4 <- valuationdata$fitted_value_3
......@@ -613,7 +613,7 @@ save(dirs, integrity_checks
, modeldata, assmntdata, valuationdata, LOCF_MODEL
, models, modeling_results
, final_sf_model, final_mf_model, final_nchars_model, best_models
, recoded_pins
, recoded_pins, tercile_models
, file=destfile)
print("Done")
......@@ -8,12 +8,27 @@ FROM [65D] AS D
INNER JOIN
HEAD AS H
ON H.HD_PIN=D.D65_PIN
WHERE D.TAX_YEAR=2019 AND H.TAX_YEAR=2018
AND CONVERT(INT, SUBSTRING(CONVERT(CHARACTER, HD_TOWN),1,2)) IN ('17')
WHERE D.TAX_YEAR=2019 AND H.TAX_YEAR=2018 AND D65_COMMENT IN ('B/R RED 16','B/R RED 17','B/R RED 18')
AND CONVERT(INT, SUBSTRING(CONVERT(CHARACTER, HD_TOWN),1,2)) IN ('", target_township, "')
AND D65_COMMENT NOT IN ('')
GROUP BY D65_EMPID
"))
# Count desk-review (65D) records per employee where the reviewer set a new
# market value (D65_NEW_MV > 0) differing from the estimated value, for the
# target township. NOTE(review): the result is not assigned, so it is only
# echoed when this script is run interactively -- assign it if it is needed
# downstream.
dbGetQuery(CCAODATA,
paste0("
SELECT COUNT(D65_PIN), D65_EMPID
FROM [65D] AS D
INNER JOIN
HEAD AS H
ON H.HD_PIN=D.D65_PIN
WHERE D.TAX_YEAR=2019 AND H.TAX_YEAR=2018 AND D65_NEW_MV!=D65_EST_MV AND D65_NEW_MV>0
AND CONVERT(INT, SUBSTRING(CONVERT(CHARACTER, HD_TOWN),1,2)) IN ('", target_township, "')
AND D65_COMMENT NOT IN ('')
GROUP BY D65_EMPID
"))
desktop_review_results <-
dbGetQuery(CCAODATA,
paste0("
......@@ -23,7 +38,7 @@ INNER JOIN
HEAD AS H
ON H.HD_PIN=D.D65_PIN
WHERE D.TAX_YEAR=2019 AND H.TAX_YEAR=2018
AND CONVERT(INT, SUBSTRING(CONVERT(CHARACTER, HD_TOWN),1,2)) IN ('17')
AND CONVERT(INT, SUBSTRING(CONVERT(CHARACTER, HD_TOWN),1,2)) IN ('", target_township, "')
AND D65_COMMENT NOT IN ('')
"))
......@@ -53,10 +68,10 @@ valuationdata$fitted_value_7[valuationdata$fitted_value_7==0] <- valuationdata$f
# Ensure numeric type before computing the sales ratio below
valuationdata$fitted_value_7 <- as.numeric(valuationdata$fitted_value_7)
# NOTE(review): 'raio_7' looks like a typo for 'ratio_7' (cf. ratio_3 used
# elsewhere); not renamed here because downstream code may already reference
# the misspelled column -- fix both sides together.
valuationdata$raio_7 <- valuationdata$fitted_value_7/valuationdata$most_recent_sale_price
save(dirs, integrity_checks
, modeldata, assmntdata, valuationdata, LOCF_MODEL
, models, modeling_results
, final_sf_model, final_mf_model, final_nchars_model, best_models
, recoded_pins, tercile_models
, file=destfile)
This diff is collapsed.
......@@ -36,100 +36,13 @@ for (i in levels(outlier_corrections$NBHD)) {
}
# PINs receiving manual outlier corrections for the current township
# (parallel to outlier_fields and outlier_changes below).
# NOTE(review): stripped-diff residue had left the superseded 30-PIN list
# stacked above this assignment WITHOUT its closing paren -- a syntax error.
# Only the current single-PIN list is kept.
outlier_pins <- c("08104080180000"
)
# For each PIN in outlier_pins, the characteristic fields to overwrite
# (parallel to outlier_pins and outlier_changes).
# NOTE(review): stripped-diff residue had left the superseded 30-entry list
# stacked above this assignment WITHOUT its closing paren -- a syntax error.
# Only the current single-entry list is kept.
outlier_fields <- list(c("BLDG_SF", "BSMT", "FBATH", "EXT_WALL")
)
# Replacement values, positionally parallel to outlier_fields (so here
# presumably BLDG_SF=1508, BSMT=1, FBATH=2, EXT_WALL=2 -- confirm ordering).
# NOTE(review): stripped-diff residue had left the superseded 30-entry list
# stacked above this assignment WITHOUT its closing paren -- a syntax error.
# Only the current single-entry list is kept.
outlier_changes <- list(c("1508", "1", "2", "2")
)
# create subset of VALUATIONDATA only including those pins
......@@ -161,5 +74,8 @@ if (length(outlier_pins) == length(outlier_fields) & length(outlier_fields) == l
write.table(outlier_corrections, file = paste0("O:/CCAODATA/data/bad_data/corrected_outliers_", target_township, "_2019.csv"), row.names = FALSE, sep = ",")
}else{
print("ERROR | input lists are NOT EQUAL; you've bungled it, good work (╯ ͡° ▽ ͡°)╯︵ ┻━┻")
}
\ No newline at end of file
print("ERROR | input lists are NOT EQUAL; you've bungled it, good work (╯ ͡° ▽ ͡°)╯︵ ┻━┻")
}
save(outlier_corrections, outlier_pins, outlier_fields, outlier_changes
, file=paste0("O:/CCAODATA/data/bad_data/corrected_outliers_", target_township, "_2019.Rda"))
\ No newline at end of file
......@@ -50,9 +50,9 @@ dirs <- list(code=paste0(wd,"/code.r/")
rm(common.drive, wd)
# What townships are we pulling data for?
# NOTE(review): stripped-diff residue had left the superseded assignments
# ("16, 22, 35, 38" / target 16) stacked above these; since later assignments
# win anyway, only the current values are kept.
modeling_townships <- "22, 23, 25, 38"
# What township are we assessing?
target_township <- 25
# NORTH TRI = 17, 23, 10, 16, 18, 20, 22, , 24, 25, 26, 29, 30, 26
# Rogers Park = 75
# What is the lower limit on sale dates?
......@@ -139,10 +139,13 @@ source(paste0(dirs$code, "3_sf_valuation.R"))
# Push outliers to desktop review
source(paste0(dirs$code, "4_sf_desktop_review.R"))
# Echo the integrity-check log accumulated so far
integrity_checks
# Export and report on values
source(paste0(dirs$code, "5_sf_export&visualize.R"))
# Compare CAMA and legacy valuations -- currently disabled.
# NOTE(review): stripped-diff residue had left this call both active and
# commented out; diff ordering indicates the commented (disabled) form is
# current, so only it is kept. Re-enable by uncommenting.
#source(paste0(dirs$code, "6_legacy_comparison.R"))
# Recover desk review
source(paste0(dirs$code, "7_recover_desktop_reviews.R"))