# people_income_lgbm.py
# predict income from sex, age, State, politics

import numpy as np
import lightgbm as lgbm

# -----------------------------------------------------------

def accuracy(model, data_x, data_y, pct_close):
  """Return the fraction of predictions that are close to the true target.

  A prediction counts as correct when
  abs(predicted - true) < abs(pct_close * true). The comparison is strict,
  so an item whose true value is 0 can never be counted as correct.

  Args:
    model: object with a predict(x) method that accepts a 2-D array
      (one row per item) and returns one prediction per row.
    data_x: predictor values, one row per item.
    data_y: true target values; only the first len(data_x) are used.
    pct_close: tolerance as a fraction of the true value (0.07 = 7%).

  Returns:
    A float in [0.0, 1.0]. Empty input gives 0.0, the same value
    show_acc_matrix reports for an empty interval.

  Raises:
    ValueError: if data_y has fewer items than data_x.
  """
  n = len(data_x)
  if n == 0:
    return 0.0  # nothing to score; avoids dividing by zero

  x = np.asarray(data_x).reshape(n, -1)  # one row per item
  y = np.asarray(data_y)[:n]
  if len(y) < n:
    raise ValueError("data_y has fewer items than data_x")
  y = y.reshape(n, -1)[:, 0]

  # one batched predict() call instead of one call per row
  preds = np.asarray(model.predict(x)).reshape(n, -1)[:, 0]
  is_close = np.abs(preds - y) < np.abs(pct_close * y)
  return int(np.count_nonzero(is_close)) / n

# -----------------------------------------------------------

def accuracy_matrix(model, data_x, data_y,
  pct_close, points):
  """Count correct and wrong predictions per interval of the true target.

  Interval k covers points[k] <= y < points[k+1]; the last interval also
  includes its upper edge. Items whose true value lies outside every
  interval are left out of the counts instead of being added to the
  first interval. This is the same bin convention numpy.histogram uses.

  A prediction counts as correct when
  abs(predicted - true) < abs(pct_close * true).

  Args:
    model: object with a predict(x) method that accepts a 2-D array
      (one row per item) and returns one prediction per row.
    data_x: predictor values, one row per item.
    data_y: true target values; only the first len(data_x) are used.
    pct_close: tolerance as a fraction of the true value (0.07 = 7%).
    points: interval edges, expected in ascending order.

  Returns:
    An int64 array of shape (len(points) - 1, 2) with the number of
    correct predictions in column 0 and wrong predictions in column 1.
    The array has zero rows when points has fewer than two edges.

  Raises:
    ValueError: if data_y has fewer items than data_x.
  """
  n_intervals = max(len(points) - 1, 0)
  result = np.zeros((n_intervals, 2), dtype=np.int64)
  # n_corrects in col [0], n_wrongs in col [1]
  n = len(data_x)
  if n == 0 or n_intervals == 0:
    return result

  x = np.asarray(data_x).reshape(n, -1)  # one row per item
  y_all = np.asarray(data_y)[:n]
  if len(y_all) < n:
    raise ValueError("data_y has fewer items than data_x")
  y_all = y_all.reshape(n, -1)[:, 0]

  # one batched predict() call instead of one call per row
  preds = np.asarray(model.predict(x)).reshape(n, -1)[:, 0]
  is_close = np.abs(preds - y_all) < np.abs(pct_close * y_all)

  last = n_intervals - 1
  for y, close in zip(y_all, is_close):
    interval = -1  # -1 means "not inside any interval"
    for k in range(n_intervals):
      below_upper = y < points[k+1] or (k == last and y == points[k+1])
      if y >= points[k] and below_upper:
        interval = k
        break
    if interval < 0:
      continue  # out of range: do not pollute the first interval

    if close:
      result[interval][0] += 1
    else:
      result[interval][1] += 1
  return result

# -----------------------------------------------------------

def show_acc_matrix(am, points):
  """Print an accuracy matrix as a fixed-width table.

  Args:
    am: sequence of rows, each holding the number of correct predictions
      at index 0 and the number of wrong predictions at index 1.
    points: interval edges; row k is labelled points[k] to points[k+1].

  Each printed row shows the interval, the correct / wrong / total counts,
  and the accuracy, which is reported as 0.0 for an interval with no items.
  """
  header = "from      to         correct  wrong   count    accuracy"
  print("  " + header)
  row_fmt = "%10.2f%10.2f%8d%8d%8d%12.4f"
  for k, row in enumerate(am):
    n_ok = row[0]
    n_bad = row[1]
    total = n_ok + n_bad
    # an empty interval has no meaningful ratio; report it as zero
    ratio = n_ok / total if total != 0 else 0.0
    print(row_fmt % (points[k], points[k+1], n_ok, n_bad, total, ratio))

# -----------------------------------------------------------

def _load_people(path):
  """Read one people data file and return (predictors, incomes).

  The file is comma-delimited with columns
  sex, age, State, income, politics; lines starting with '#' are skipped.
  Predictors are columns 0, 1, 2, 4 and the target is column 3.
  """
  # Parse the file once and slice, rather than reading it twice.
  # ndmin=2 keeps a one-row file two-dimensional so the slicing still works.
  xy = np.loadtxt(path, usecols=[0,1,2,3,4],
    delimiter=",", comments="#", dtype=np.float64, ndmin=2)
  x = np.ascontiguousarray(xy[:, [0,1,2,4]])
  y = np.ascontiguousarray(xy[:, 3])
  return x, y

def main():
  """Run the demo: load data, train a LightGBM regressor, evaluate, save.

  Reads Data/people_train.txt and Data/people_test.txt relative to the
  current working directory, prints accuracy figures (a prediction is
  correct when it is within 7% of the true income), predicts one new
  item, and pickles the trained model to Models/income_model.pkl.
  """
  import os
  import pickle

  # 0. get started
  print("\nBegin People predict income using LightGBM ")
  print("Predict income from sex, age, State, politics ")
  np.random.seed(1)

  # 1. load data
  # sex, age, State, income, politics
  #  0    1     2       3       4
  print("\nLoading train and test data ")
  # os.path.join yields the original ".\Data\..." strings on Windows
  # and working "./Data/..." paths everywhere else
  train_file = os.path.join(".", "Data", "people_train.txt")
  test_file = os.path.join(".", "Data", "people_test.txt")

  x_train, y_train = _load_people(train_file)
  x_test, y_test = _load_people(test_file)

  np.set_printoptions(precision=0, suppress=True)
  print("\nFirst few train data: ")
  for i in range(min(3, len(x_train))):  # tolerate very small files
    print(x_train[i], end="")
    print("  | " + str(y_train[i]))
  print(". . . ")

  # 2. create and train model
  print("\nCreating and training LightGBM regression model ")
  params = {
    'objective': 'regression',  # not needed
    'boosting_type': 'gbdt',  # default
    'num_leaves': 31,  # default
    'learning_rate': 0.05,  # default = 0.10
    'feature_fraction': 1.0,  # default
    'min_data_in_leaf': 2,  # default = 20
    'random_state': 0,
    'verbosity': -1
  }
  model = lgbm.LGBMRegressor(**params)  # scikit API
  model.fit(x_train, y_train)
  print("Done ")

  # 3. evaluate model
  print("\nEvaluating model accuracy (within 0.07) ")
  acc_train = accuracy(model, x_train, y_train, 0.07)
  print("accuracy on train data = %0.4f " % acc_train)
  acc_test = accuracy(model, x_test, y_test, 0.07)
  print("accuracy on test data = %0.4f " % acc_test)

  inc_pts = \
    [0.00, 25000.00, 50000.00, 75000.00, 100000.00]
  am_train = \
    accuracy_matrix(model, x_train, y_train, 0.07, inc_pts)
  print("\nAccuracy on training data (within 0.07 of true):")
  show_acc_matrix(am_train, inc_pts)

  am_test = \
    accuracy_matrix(model, x_test, y_test, 0.07, inc_pts)
  print("\nAccuracy on test data (within 0.07 of true):")
  show_acc_matrix(am_test, inc_pts)

  # 4. use model
  print("\nPredicting income for M 35 Oklahoma moderate ")
  x = np.array([[0, 35, 2, 1]], dtype=np.float64)
  y_pred = model.predict(x)
  print("\nPredicted income = %0.2f " % y_pred[0])

  # 5. save model
  print("Saving model ")
  model_dir = os.path.join(".", "Models")
  os.makedirs(model_dir, exist_ok=True)  # do not fail on a fresh checkout
  pth = os.path.join(model_dir, "income_model.pkl")
  with open(pth, "wb") as f:
    pickle.dump(model, f)

  print("\nEnd demo ")

# -----------------------------------------------------------

# Run the demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
  main()
# training data:
# 
# people_train.txt
# sex (M = 0, F = 1)
# age
# State (Michigan = 0, Nebraska = 1, Oklahoma = 2)
# income
# politics (conservative = 0, moderate = 1, liberal = 2)
#
# 1,24,0,29500.00,2
# 0,39,2,51200.00,1
# 1,63,1,75800.00,0
# 0,36,0,44500.00,1
# 1,27,1,28600.00,2
# 1,50,1,56500.00,1
# 1,50,2,55000.00,1
# 0,19,2,32700.00,0
# 1,22,1,27700.00,1
# 0,39,2,47100.00,2
# 1,34,0,39400.00,1
# 0,22,0,33500.00,0
# 1,35,2,35200.00,2
# 0,33,1,46400.00,1
# 1,45,1,54100.00,1
# 1,42,1,50700.00,1
# 0,33,1,46800.00,1
# 1,25,2,30000.00,1
# 0,31,1,46400.00,0
# 1,27,0,32500.00,2
# 1,48,0,54000.00,1
# 0,64,1,71300.00,2
# 1,61,1,72400.00,0
# 1,54,2,61000.00,0
# 1,29,0,36300.00,0
# 1,50,2,55000.00,1
# 1,55,2,62500.00,0
# 1,40,0,52400.00,0
# 1,22,0,23600.00,2
# 1,68,1,78400.00,0
# 0,60,0,71700.00,2
# 0,34,2,46500.00,1
# 0,25,2,37100.00,0
# 0,31,1,48900.00,1
# 1,43,2,48000.00,1
# 1,58,1,65400.00,2
# 0,55,1,60700.00,2
# 0,43,1,51100.00,1
# 0,43,2,53200.00,1
# 0,21,0,37200.00,0
# 1,55,2,64600.00,0
# 1,64,1,74800.00,0
# 0,41,0,58800.00,1
# 1,64,2,72700.00,0
# 0,56,2,66600.00,2
# 1,31,2,36000.00,1
# 0,65,2,70100.00,2
# 1,55,2,64300.00,0
# 0,25,0,40300.00,0
# 1,46,2,51000.00,1
# 0,36,0,53500.00,0
# 1,52,1,58100.00,1
# 1,61,2,67900.00,0
# 1,57,2,65700.00,0
# 0,46,1,52600.00,1
# 0,62,0,66800.00,2
# 1,55,2,62700.00,0
# 0,22,2,27700.00,1
# 0,50,0,62900.00,0
# 0,32,1,41800.00,1
# 0,21,2,35600.00,0
# 1,44,1,52000.00,1
# 1,46,1,51700.00,1
# 1,62,1,69700.00,0
# 1,57,1,66400.00,0
# 0,67,2,75800.00,2
# 1,29,0,34300.00,2
# 1,53,0,60100.00,0
# 0,44,0,54800.00,1
# 1,46,1,52300.00,1
# 0,20,1,30100.00,1
# 0,38,0,53500.00,1
# 1,50,1,58600.00,1
# 1,33,1,42500.00,1
# 0,33,1,39300.00,1
# 1,26,1,40400.00,0
# 1,58,0,70700.00,0
# 1,43,2,48000.00,1
# 0,46,0,64400.00,0
# 1,60,0,71700.00,0
# 0,42,0,48900.00,1
# 0,56,2,56400.00,2
# 0,62,1,66300.00,2
# 0,50,0,64800.00,1
# 1,47,2,52000.00,1
# 0,67,1,80400.00,2
# 0,40,2,50400.00,1
# 1,42,1,48400.00,1
# 1,64,0,72000.00,0
# 0,47,0,58700.00,2
# 1,45,1,52800.00,1
# 0,25,2,40900.00,0
# 1,38,0,48400.00,0
# 1,55,2,60000.00,1
# 0,44,0,60600.00,1
# 1,33,0,41000.00,1
# 1,34,2,39000.00,1
# 1,27,1,33700.00,2
# 1,32,1,40700.00,1
# 1,42,2,47000.00,1
# 0,24,2,40300.00,0
# 1,42,1,50300.00,1
# 1,25,2,28000.00,2
# 1,51,1,58000.00,1
# 0,55,1,63500.00,2
# 1,44,0,47800.00,2
# 0,18,0,39800.00,0
# 0,67,1,71600.00,2
# 1,45,2,50000.00,1
# 1,48,0,55800.00,1
# 0,25,1,39000.00,1
# 0,67,0,78300.00,1
# 1,37,2,42000.00,1
# 0,32,0,42700.00,1
# 1,48,0,57000.00,1
# 0,66,2,75000.00,2
# 1,61,0,70000.00,0
# 0,58,2,68900.00,1
# 1,19,0,24000.00,2
# 1,38,2,43000.00,1
# 0,27,0,36400.00,1
# 1,42,0,48000.00,1
# 1,60,0,71300.00,0
# 0,27,2,34800.00,0
# 1,29,1,37100.00,0
# 0,43,0,56700.00,1
# 1,48,0,56700.00,1
# 1,27,2,29400.00,2
# 0,44,0,55200.00,0
# 1,23,1,26300.00,2
# 0,36,1,53000.00,2
# 1,64,2,72500.00,0
# 1,29,2,30000.00,2
# 0,33,0,49300.00,1
# 0,66,1,75000.00,2
# 0,21,2,34300.00,0
# 1,27,0,32700.00,2
# 1,29,0,31800.00,2
# 0,31,0,48600.00,1
# 1,36,2,41000.00,1
# 1,49,1,55700.00,1
# 0,28,0,38400.00,0
# 0,43,2,56600.00,1
# 0,46,1,58800.00,1
# 1,57,0,69800.00,0
# 0,52,2,59400.00,1
# 0,31,2,43500.00,1
# 0,55,0,62000.00,2
# 1,50,0,56400.00,1
# 1,48,1,55900.00,1
# 0,22,2,34500.00,0
# 1,59,2,66700.00,0
# 1,34,0,42800.00,2
# 0,64,0,77200.00,2
# 1,29,2,33500.00,2
# 0,34,1,43200.00,1
# 0,61,0,75000.00,2
# 1,64,2,71100.00,0
# 0,29,0,41300.00,0
# 1,63,1,70600.00,0
# 0,29,1,40000.00,0
# 0,51,0,62700.00,1
# 0,24,2,37700.00,0
# 1,48,1,57500.00,1
# 1,18,0,27400.00,0
# 1,18,0,20300.00,2
# 1,33,1,38200.00,2
# 0,20,2,34800.00,0
# 1,29,2,33000.00,2
# 0,44,2,63000.00,0
# 0,65,2,81800.00,0
# 0,56,0,63700.00,2
# 0,52,2,58400.00,1
# 0,29,1,48600.00,0
# 0,47,1,58900.00,1
# 1,68,0,72600.00,2
# 1,31,2,36000.00,1
# 1,61,1,62500.00,2
# 1,19,1,21500.00,2
# 1,38,2,43000.00,1
# 0,26,0,42300.00,0
# 1,61,1,67400.00,0
# 1,40,0,46500.00,1
# 0,49,0,65200.00,1
# 1,56,0,67500.00,0
# 0,48,1,66000.00,1
# 1,52,0,56300.00,2
# 0,18,0,29800.00,0
# 0,56,2,59300.00,2
# 0,52,1,64400.00,1
# 0,18,1,28600.00,1
# 0,58,0,66200.00,2
# 0,39,1,55100.00,1
# 0,46,0,62900.00,1
# 0,40,1,46200.00,1
# 0,60,0,72700.00,2
# 1,36,1,40700.00,2
# 1,44,0,52300.00,1
# 1,28,0,31300.00,2
# 1,54,2,62600.00,0

# people_test.txt
#
# 0,51,0,61200.00,1
# 0,32,1,46100.00,1
# 1,55,0,62700.00,0
# 1,25,2,26200.00,2
# 1,33,2,37300.00,2
# 0,29,1,46200.00,0
# 1,65,0,72700.00,0
# 0,43,1,51400.00,1
# 0,54,1,64800.00,2
# 1,61,1,72700.00,0
# 1,52,1,63600.00,0
# 1,30,1,33500.00,2
# 1,29,0,31400.00,2
# 0,47,2,59400.00,1
# 1,39,1,47800.00,1
# 1,47,2,52000.00,1
# 0,49,0,58600.00,1
# 0,63,2,67400.00,2
# 0,30,0,39200.00,0
# 0,61,2,69600.00,2
# 0,47,2,58700.00,1
# 1,30,2,34500.00,2
# 0,51,2,58000.00,1
# 0,24,0,38800.00,1
# 0,49,0,64500.00,1
# 1,66,2,74500.00,0
# 0,65,0,76900.00,0
# 0,46,1,58000.00,0
# 0,45,2,51800.00,1
# 0,47,0,63600.00,0
# 0,29,0,44800.00,0
# 0,57,2,69300.00,2
# 0,20,0,28700.00,2
# 0,35,0,43400.00,1
# 0,61,2,67000.00,2
# 0,31,2,37300.00,1
# 1,18,0,20800.00,2
# 1,26,2,29200.00,2
# 0,28,0,36400.00,2
# 0,59,2,69400.00,2

