2022年3月7日月曜日

House Price@Kaggle with TensorFlow

After completing a Coursera TensorFlow course, I tried a Kaggle competition.

There is probably a lot of room for tuning, but at least this code works.
I know it is quite rough, but it may still help someone.


import pandas as pd
import numpy as np
import tensorflow as tf

# Placeholder only: rebound to FEAT_CONT + FEAT_CAT further down the script,
# before get_input_fn is first called.
FEATURES = []

# Columns that are turned into numeric feature columns.
FEAT_CONT = [
    "LotFrontage", "LotArea", "OverallQual", "OverallCond",
    "YearBuilt", "YearRemodAdd", "MasVnrArea", "TotalBsmtSF",
    "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea",
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
    "TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars",
    "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

# Columns that are treated as categorical (vocabulary-list columns).
FEAT_CAT = [
    "MSSubClass", "MSZoning", "Street", "Alley",
    "LotShape", "LandContour", "Utilities", "LotConfig",
    "LandSlope", "Neighborhood", "Condition1", "Condition2",
    "BldgType", "HouseStyle", "RoofStyle", "RoofMatl",
    "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual",
    "ExterCond", "Foundation", "Heating", "HeatingQC",
    "CentralAir", "Electrical", "KitchenQual", "Functional",
    "FireplaceQu", "GarageType", "GarageFinish", "GarageQual",
    "GarageCond", "PavedDrive", "PoolQC", "Fence",
    "MiscFeature", "SaleType", "SaleCondition",
]

def get_input_fn(data_set, num_epochs=None, shuffle=True):
  """Build a pandas-backed estimator input function for ``data_set``.

  The feature frame holds the columns named in the module-level ``FEATURES``
  list (looked up when this function is called, not when it is defined) and
  the label series is the ``SalePrice`` column.

  Args:
    data_set: DataFrame containing every ``FEATURES`` column and ``SalePrice``.
    num_epochs: forwarded unchanged to ``pandas_input_fn``.
    shuffle: forwarded unchanged to ``pandas_input_fn``.

  Returns:
    Whatever ``tf.compat.v1.estimator.inputs.pandas_input_fn`` returns.
  """
  feature_frame = pd.DataFrame(
      {name: data_set[name].values for name in FEATURES})
  labels = pd.Series(data_set["SalePrice"].values)
  return tf.compat.v1.estimator.inputs.pandas_input_fn(
      x=feature_frame,
      y=labels,
      num_epochs=num_epochs,
      shuffle=shuffle)

def _load_house_frame(csv_path):
    """Read one House Prices CSV and apply the cleaning shared by both files.

    Steps: drop columns that are entirely NaN, fill the remaining NaNs with 0,
    drop the ``Id`` column, and turn every 0 in a ``FEAT_CAT`` column into the
    string "0".

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        The cleaned DataFrame.
    """
    frame = pd.read_csv(csv_path)
    frame.dropna(how='all', axis=1, inplace=True)
    frame.fillna(0, inplace=True)
    frame = frame.drop(["Id"], axis=1)
    for column in FEAT_CAT:
        zero_mask = frame[column] == 0
        # A single .loc assignment instead of frame[column][mask] = "0":
        # the chained form writes to a temporary under pandas copy-on-write
        # and would silently leave the integer 0s in place.
        # Skipping columns without zeros avoids touching their dtype at all.
        if zero_mask.any():
            frame.loc[zero_mask, column] = "0"
    return frame


prediction_set = _load_house_frame("test.csv")
# get_input_fn always reads a SalePrice column, so give the unlabeled
# prediction data a dummy one.
prediction_set["SalePrice"] = 0.0

training_set = _load_house_frame("train.csv")

# Hold out everything after the first 1400 rows for evaluation.
test_set     = training_set.iloc[1400:, :]
training_set = training_set.iloc[:1400, :]


# Every column the input function hands to the estimator.
FEATURES = FEAT_CONT + FEAT_CAT

# One numeric column per continuous feature.
feature_cols = [tf.feature_column.numeric_column(k) for k in FEAT_CONT]

# One 16-dimensional embedding per categorical feature, with the vocabulary
# taken from the values present in the training rows.
feature_cat = []
for name in FEAT_CAT:
    # Sort so the vocabulary order is the same on every run (a bare set has
    # no stable order); key=str keeps the sort valid even for mixed types.
    vocabulary = sorted(set(training_set[name].values.tolist()), key=str)
    feature_cat.append(
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                key=name, vocabulary_list=tuple(vocabulary)),
            16))

# Not used below; kept so the module still defines the same names.
numN = len(FEATURES)

# Pass BOTH column lists: previously only feature_cols was given, so the
# embedding columns built above were never seen by the model.
regressor = tf.estimator.DNNRegressor(
    feature_columns=feature_cols + feature_cat,
    hidden_units=[300, 150, 75, 30],
    model_dir=None)

# Fit the model on the training rows.
regressor.train(input_fn=get_input_fn(training_set), steps=5000)

# Score the held-out rows in a single, unshuffled pass.
ev = regressor.evaluate(
    input_fn=get_input_fn(test_set, num_epochs=1, shuffle=False))
loss_score = ev["loss"]
print("Loss: {0:f}".format(loss_score))

# Predict the same held-out rows so they can be compared with the real prices.
y = regressor.predict(
    input_fn=get_input_fn(test_set, num_epochs=1, shuffle=False))
pred = [row["predictions"][0] for row in y]
test_result = test_set["SalePrice"].values.tolist()

# Print: row index, predicted price, actual price.
for index, predicted in enumerate(pred):
    print(index, predicted, test_result[index])


# Predict the unlabeled rows in a single, unshuffled pass so the predictions
# stay in file order.
y = regressor.predict(
    input_fn=get_input_fn(prediction_set, num_epochs=1, shuffle=False))
pred = [row["predictions"][0] for row in y]

# prediction_set had its Id column dropped, so take the Ids straight from the
# file instead of rebuilding them from a hard-coded starting number.
submission_ids = pd.read_csv("test.csv", usecols=["Id"])["Id"].tolist()

with open("output.csv", "w") as f:
    # Header row naming the two columns of the submission file.
    f.write("Id,SalePrice\n")
    f.writelines(
        "{},{}\n".format(house_id, price)
        for house_id, price in zip(submission_ids, pred))


0 件のコメント:

コメントを投稿