2.1.0
User Documentation for Apache MADlib
decision_tree.sql_in File Reference

Functions

void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text split_criterion, text grouping_cols, text weights, integer max_depth, integer min_split, integer min_bucket, integer n_bins, text pruning_params, text null_handling_params, boolean verbose_mode)
 Training of decision tree. More...
 
void __build_tree (boolean is_classification, text split_criterion, text training_table_name, text output_table_name, text id_col_name, text dependent_variable, boolean dep_is_bool, text list_of_features, varchar[] cat_features, varchar[] ordered_cat_features, varchar[] boolean_cats, varchar[] con_features, text grouping_cols, text weights, integer max_depth, integer min_split, integer min_bucket, integer n_bins, text cp_table, smallint max_n_surr, text msg_level, text null_proxy, integer n_folds)
 
text tree_train (text message)
 
text tree_train ()
 
bytea8 _dst_compute_con_splits_transition (bytea8 state, float8[] con_features, integer n_per_seg, smallint num_splits)
 
bytea8 _dst_compute_con_splits_final (bytea8 state)
 
aggregate bytea8 _dst_compute_con_splits (float8[], integer, smallint)
 
integer [] _dst_compute_entropy_transition (integer[] state, integer encoded_dep_var, integer num_dep_var)
 
integer [] _dst_compute_entropy_merge (integer[] state1, integer[] state2)
 
float8 _dst_compute_entropy_final (integer[] state)
 
aggregate float8 _dst_compute_entropy (integer, integer)
 
integer [] _map_catlevel_to_int (text[] cat_values_in_text, text[] cat_levels_in_text, integer[] cat_n_levels, boolean null_as_category)
 
bytea8 _initialize_decision_tree (boolean is_regression_tree, text impurity_function, smallint num_response_labels, smallint max_n_surr)
 
bytea8 _compute_leaf_stats_transition (bytea8 state, bytea8 tree_state, integer[] cat_features, float8[] con_features, float8 response, float8 weight, integer[] cat_levels, bytea8 con_splits, smallint n_response_labels, boolean weights_as_rows)
 
bytea8 _compute_leaf_stats_merge (bytea8 state1, bytea8 state2)
 
aggregate bytea8 _compute_leaf_stats (bytea8, integer[], float8[], float8, float8, integer[], bytea8, smallint, boolean)
 
_tree_result_type _dt_apply (bytea8 tree, bytea8 state, bytea8 con_splits, smallint min_split, smallint min_bucket, smallint max_depth, boolean subsample, integer num_random_features)
 
bytea8 _compute_surr_stats_transition (bytea8 state, bytea8 tree_state, integer[] cat_features, float8[] con_features, integer[] cat_levels, bytea8 con_splits, integer dup_count)
 
aggregate bytea8 _compute_surr_stats (bytea8, integer[], float8[], integer[], bytea8, integer)
 
bytea8 _dt_surr_apply (bytea8 tree, bytea8 state, bytea8 con_splits)
 
_flattened_tree _print_decision_tree (bytea8 tree)
 
float8 [] _compute_var_importance (bytea8 tree, integer n_cat_features, integer n_con_features)
 
float8 _predict_dt_response (bytea8 tree, integer[] cat_features, float8[] con_features)
 
float8 [] _predict_dt_prob (bytea8 tree, integer[] cat_features, float8[] con_features)
 
void tree_predict (text model, text source, text output, text pred_type)
 Use decision tree model to make predictions. More...
 
void __tree_predict (text model, text source, text output, text pred_type, boolean use_existing_tables, integer k)
 
void tree_predict (text model, text source, text output)
 
text tree_predict (text message)
 
text tree_predict ()
 
varchar tree_surr_display (text model_table)
 Display the surrogate splits of a decision tree. More...
 
varchar tree_surr_display ()
 
varchar tree_display (text model_table, boolean dot_format, boolean verbose)
 Display decision tree in dot or text format. More...
 
varchar tree_display (text model_table, boolean dot_format)
 
varchar tree_display (text model_table)
 
varchar tree_display ()
 
text _display_decision_tree (bytea8 tree, text[] cat_features, text[] con_features, text[] cat_levels_in_text, integer[] cat_n_levels, text[] dependent_levels, text id_prefix, boolean verbose)
 
text _display_decision_tree (bytea8 tree, text[] cat_features, text[] con_features, text[] cat_levels_in_text, integer[] cat_n_levels, text[] dependent_levels, text id_prefix)
 
text _display_decision_tree_surrogate (bytea8 tree, text[] cat_features, text[] con_features, text[] cat_levels_in_text, integer[] cat_n_levels)
 
text _display_text_decision_tree (bytea8 tree, text[] cat_features, text[] con_features, text[] cat_levels_in_text, integer[] cat_n_levels, text[] dependent_levels)
 
set< _cat_levels_type > _gen_cat_levels_set (text[] grp_keys, integer[] cat_n_levels, integer n_cat, text[] cat_sorted_origin)
 
void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text split_criterion, text grouping_cols, text weights, integer max_depth, integer min_split, integer min_bucket, integer n_bins, text pruning_params, text null_handling_params)
 
void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text split_criterion, text grouping_cols, text weights, integer max_depth, integer min_split, integer min_bucket, integer n_bins, text pruning_params)
 
void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text split_criterion, text grouping_cols, text weights, integer max_depth, integer min_split, integer min_bucket, integer n_bins)
 
void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text split_criterion, text grouping_cols, text weights, integer max_depth, integer min_split, integer min_bucket)
 
void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text split_criterion, text grouping_cols, text weights, integer max_depth, integer min_split)
 
void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text split_criterion, text grouping_cols, text weights, integer max_depth)
 
void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text split_criterion, text grouping_cols, text weights)
 
void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text split_criterion, text grouping_cols)
 
void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text split_criterion)
 
void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude)
 
void tree_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features)
 

Function Documentation

◆ __build_tree()

void __build_tree ( boolean  is_classification,
text  split_criterion,
text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
boolean  dep_is_bool,
text  list_of_features,
varchar []  cat_features,
varchar []  ordered_cat_features,
varchar []  boolean_cats,
varchar []  con_features,
text  grouping_cols,
text  weights,
integer  max_depth,
integer  min_split,
integer  min_bucket,
integer  n_bins,
text  cp_table,
smallint  max_n_surr,
text  msg_level,
text  null_proxy,
integer  n_folds 
)

◆ __tree_predict()

void __tree_predict ( text  model,
text  source,
text  output,
text  pred_type,
boolean  use_existing_tables,
integer  k 
)

◆ _compute_leaf_stats()

aggregate bytea8 _compute_leaf_stats ( bytea8  ,
integer  [],
float8  [],
float8  ,
float8  ,
integer  [],
bytea8  ,
smallint  ,
boolean   
)

◆ _compute_leaf_stats_merge()

bytea8 _compute_leaf_stats_merge ( bytea8  state1,
bytea8  state2 
)

◆ _compute_leaf_stats_transition()

bytea8 _compute_leaf_stats_transition ( bytea8  state,
bytea8  tree_state,
integer []  cat_features,
float8 []  con_features,
float8  response,
float8  weight,
integer []  cat_levels,
bytea8  con_splits,
smallint  n_response_labels,
boolean  weights_as_rows 
)

◆ _compute_surr_stats()

aggregate bytea8 _compute_surr_stats ( bytea8  ,
integer  [],
float8  [],
integer  [],
bytea8  ,
integer   
)

◆ _compute_surr_stats_transition()

bytea8 _compute_surr_stats_transition ( bytea8  state,
bytea8  tree_state,
integer []  cat_features,
float8 []  con_features,
integer []  cat_levels,
bytea8  con_splits,
integer  dup_count 
)

◆ _compute_var_importance()

float8 [] _compute_var_importance ( bytea8  tree,
integer  n_cat_features,
integer  n_con_features 
)

◆ _display_decision_tree() [1/2]

text _display_decision_tree ( bytea8  tree,
text []  cat_features,
text []  con_features,
text []  cat_levels_in_text,
integer []  cat_n_levels,
text []  dependent_levels,
text  id_prefix,
boolean  verbose 
)

◆ _display_decision_tree() [2/2]

text _display_decision_tree ( bytea8  tree,
text []  cat_features,
text []  con_features,
text []  cat_levels_in_text,
integer []  cat_n_levels,
text []  dependent_levels,
text  id_prefix 
)

◆ _display_decision_tree_surrogate()

text _display_decision_tree_surrogate ( bytea8  tree,
text []  cat_features,
text []  con_features,
text []  cat_levels_in_text,
integer []  cat_n_levels 
)

◆ _display_text_decision_tree()

text _display_text_decision_tree ( bytea8  tree,
text []  cat_features,
text []  con_features,
text []  cat_levels_in_text,
integer []  cat_n_levels,
text []  dependent_levels 
)

◆ _dst_compute_con_splits()

aggregate bytea8 _dst_compute_con_splits ( float8  [],
integer  ,
smallint   
)

◆ _dst_compute_con_splits_final()

bytea8 _dst_compute_con_splits_final ( bytea8  state)

◆ _dst_compute_con_splits_transition()

bytea8 _dst_compute_con_splits_transition ( bytea8  state,
float8 []  con_features,
integer  n_per_seg,
smallint  num_splits 
)

◆ _dst_compute_entropy()

aggregate float8 _dst_compute_entropy ( integer  ,
integer   
)

◆ _dst_compute_entropy_final()

float8 _dst_compute_entropy_final ( integer []  state)

◆ _dst_compute_entropy_merge()

integer [] _dst_compute_entropy_merge ( integer []  state1,
integer []  state2 
)

◆ _dst_compute_entropy_transition()

integer [] _dst_compute_entropy_transition ( integer []  state,
integer  encoded_dep_var,
integer  num_dep_var 
)

◆ _dt_apply()

_tree_result_type _dt_apply ( bytea8  tree,
bytea8  state,
bytea8  con_splits,
smallint  min_split,
smallint  min_bucket,
smallint  max_depth,
boolean  subsample,
integer  num_random_features 
)

◆ _dt_surr_apply()

bytea8 _dt_surr_apply ( bytea8  tree,
bytea8  state,
bytea8  con_splits 
)

◆ _gen_cat_levels_set()

set<_cat_levels_type> _gen_cat_levels_set ( text []  grp_keys,
integer []  cat_n_levels,
integer  n_cat,
text []  cat_sorted_origin 
)

◆ _initialize_decision_tree()

bytea8 _initialize_decision_tree ( boolean  is_regression_tree,
text  impurity_function,
smallint  num_response_labels,
smallint  max_n_surr 
)

◆ _map_catlevel_to_int()

integer [] _map_catlevel_to_int ( text []  cat_values_in_text,
text []  cat_levels_in_text,
integer []  cat_n_levels,
boolean  null_as_category 
)

◆ _predict_dt_prob()

float8 [] _predict_dt_prob ( bytea8  tree,
integer []  cat_features,
float8 []  con_features 
)

◆ _predict_dt_response()

float8 _predict_dt_response ( bytea8  tree,
integer []  cat_features,
float8 []  con_features 
)

◆ _print_decision_tree()

_flattened_tree _print_decision_tree ( bytea8  tree)

◆ tree_display() [1/4]

varchar tree_display ( text  model_table,
boolean  dot_format,
boolean  verbose 
)
Parameters
model_table: Name of the table containing the decision tree model.

◆ tree_display() [2/4]

varchar tree_display ( text  model_table,
boolean  dot_format 
)

◆ tree_display() [3/4]

varchar tree_display ( text  model_table)

◆ tree_display() [4/4]

varchar tree_display ( )

◆ tree_predict() [1/4]

void tree_predict ( text  model,
text  source,
text  output,
text  pred_type 
)
Parameters
model: Name of the table containing the decision tree model.
source: Name of the table containing the prediction data.
output: Name of the table to output the prediction results.
pred_type: OPTIONAL (Default = 'response'). For regression trees, 'response' implies that the output is the predicted value. For classification trees, this can be 'response', giving the classification prediction as output, or 'prob', giving the class probabilities as output. For two classes, only a single probability value is output, corresponding to the first class when the two classes are sorted by name; for more than two classes, an array of class probabilities (one probability per class) is output.

See Decision Tree for more details.

◆ tree_predict() [2/4]

void tree_predict ( text  model,
text  source,
text  output 
)

◆ tree_predict() [3/4]

text tree_predict ( text  message)

◆ tree_predict() [4/4]

text tree_predict ( )

◆ tree_surr_display() [1/2]

varchar tree_surr_display ( text  model_table)
Parameters
model_table: Name of the table containing the decision tree model.

◆ tree_surr_display() [2/2]

varchar tree_surr_display ( )

◆ tree_train() [1/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  split_criterion,
text  grouping_cols,
text  weights,
integer  max_depth,
integer  min_split,
integer  min_bucket,
integer  n_bins,
text  pruning_params,
text  null_handling_params,
boolean  verbose_mode 
)
Parameters
split_criterion: Impurity criterion used to select the feature on which to split a node. Available options are 'gini', 'cross-entropy', and 'misclassification'. The "cart" algorithm provides an additional option of 'mse'.
training_table_name: Name of the table containing the training data.
output_table_name: Name of the table to output the model.
id_col_name: Name of the column containing the id information in the training data.
dependent_variable: Name of the column that contains the output for training. Boolean, integer, and text are considered classification outputs, while float values are considered regression outputs.
list_of_features: List of column names (comma-separated string) to use as predictors. Can also be '*', implying that all columns are to be used as predictors (except the ones included in the next argument). Boolean, integer, and text columns are considered categorical columns.
list_of_features_to_exclude: OPTIONAL. List of column names (comma-separated string) to exclude from the predictors list.
grouping_cols: OPTIONAL. List of column names (comma-separated string) to group the data by. This will lead to creating multiple decision trees, one for each group.
weights: OPTIONAL. Name of the column containing the weight for each observation.
max_depth: OPTIONAL (Default = 7). Maximum depth of any node of the final tree, with the root node counted as depth 0. A deeper tree can lead to better prediction but will also result in longer processing time and higher memory usage.
min_split: OPTIONAL (Default = 20). Minimum number of observations that must exist in a node for a split to be attempted.
min_bucket: OPTIONAL (Default = min_split/3). Minimum number of observations in any terminal node. If only one of min_bucket or min_split is specified, min_split is set to min_bucket*3 or min_bucket to min_split/3, as appropriate.
n_bins: OPTIONAL (Default = 20). Number of bins to use during binning. Continuous-valued features are binned into discrete bins (per the quantile values) to compute split boundaries. This global parameter is used to compute the resolution of the bins. A higher number of bins will lead to higher processing time.
pruning_params: OPTIONAL (Default: cp=0). Pruning parameter string containing key-value pairs. The keys can be: cp (default = 0), a complexity parameter that determines that a split is attempted only if it decreases the overall lack of fit by a factor of 'cp'; n_folds (default = 0), the number of cross-validation folds.
verbose_mode: OPTIONAL (Default = false). Prints status information on the splits performed and any other information useful for debugging.

See Decision Tree for more details.

◆ tree_train() [2/14]

text tree_train ( text  message)

◆ tree_train() [3/14]

text tree_train ( )

◆ tree_train() [4/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  split_criterion,
text  grouping_cols,
text  weights,
integer  max_depth,
integer  min_split,
integer  min_bucket,
integer  n_bins,
text  pruning_params,
text  null_handling_params 
)

◆ tree_train() [5/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  split_criterion,
text  grouping_cols,
text  weights,
integer  max_depth,
integer  min_split,
integer  min_bucket,
integer  n_bins,
text  pruning_params 
)

◆ tree_train() [6/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  split_criterion,
text  grouping_cols,
text  weights,
integer  max_depth,
integer  min_split,
integer  min_bucket,
integer  n_bins 
)

◆ tree_train() [7/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  split_criterion,
text  grouping_cols,
text  weights,
integer  max_depth,
integer  min_split,
integer  min_bucket 
)

◆ tree_train() [8/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  split_criterion,
text  grouping_cols,
text  weights,
integer  max_depth,
integer  min_split 
)

◆ tree_train() [9/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  split_criterion,
text  grouping_cols,
text  weights,
integer  max_depth 
)

◆ tree_train() [10/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  split_criterion,
text  grouping_cols,
text  weights 
)

◆ tree_train() [11/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  split_criterion,
text  grouping_cols 
)

◆ tree_train() [12/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  split_criterion 
)

◆ tree_train() [13/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude 
)

◆ tree_train() [14/14]

void tree_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features 
)