1.15.1
User Documentation for Apache MADlib
lda.sql_in File Reference

SQL functions for Latent Dirichlet Allocation. More...

Functions

set< lda_result > lda_train (text data_table, text model_table, text output_data_table, int4 voc_size, int4 topic_num, int4 iter_num, float8 alpha, float8 beta)
 This UDF provides an entry for the lda training process. More...
 
set< lda_result > lda_predict (text data_table, text model_table, text output_table)
 This UDF provides an entry for the lda prediction process. More...
 
set< lda_result > lda_predict (text data_table, text model_table, text output_table, int4 iter_num)
 An overloaded version that allows users to specify iter_num. More...
 
set< lda_result > lda_get_topic_word_count (text model_table, text output_table)
 This UDF computes the per-topic word counts. More...
 
set< lda_result > lda_get_word_topic_count (text model_table, text output_table)
 This UDF computes the per-word topic counts. More...
 
set< lda_result > lda_get_topic_desc (text model_table, text vocab_table, text desc_table, int4 top_k)
 This UDF gets the description for each topic (top-k words). More...
 
set< lda_result > lda_get_word_topic_mapping (text lda_output_table, text mapping_table)
 This UDF gets the wordid - topicid mapping from the lda training output table. More...
 
int4 [] __lda_random_assign (int4 word_count, int4 topic_num)
 This UDF assigns topics to words in a document randomly. More...
 
int4 [] __lda_gibbs_sample (int4[] words, int4[] counts, int4[] doc_topic, int8[] model, float8 alpha, float8 beta, int4 voc_size, int4 topic_num, int4 iter_num)
 This UDF learns the topics of words in a document and is the main step of a Gibbs sampling iteration. The model parameter (including the per-word topic counts and corpus-level topic counts) is passed to this function in the first call and then transferred to subsequent calls through fcinfo->flinfo->fn_extra to allow immediate updates. More...
 
int8 [] __lda_count_topic_sfunc (int8[] state, int4[] words, int4[] counts, int4[] topic_assignment, int4 voc_size, int4 topic_num)
 This UDF is the sfunc for the aggregator computing the topic counts for each word and the topic count in the whole corpus. It scans the topic assignments in a document and updates the topic counts. More...
 
int8 [] __lda_count_topic_prefunc (int8[] state1, int8[] state2)
 This UDF is the prefunc for the aggregator computing the per-word topic counts. More...
 
aggregate int8 [] __lda_count_topic_agg (int4[], int4[], int4[], int4, int4)
 This UDA computes the word topic counts by scanning and summing up topic assignments in each document. More...
 
float8 lda_get_perplexity (text model_table, text output_data_table)
 This UDF computes the perplexity given the output data table and the model table. More...
 
int8 [] __lda_perplexity_sfunc (int8[] state, int4[] words, int4[] counts, int4[] doc_topic, int8[] model, float8 alpha, float8 beta, int4 voc_size, int4 topic_num)
 This UDF is the sfunc for the aggregator computing the perplexity. More...
 
int8 [] __lda_perplexity_prefunc (int8[] state1, int8[] state2)
 This UDF is the prefunc for the aggregator computing the perplexity. More...
 
float8 __lda_perplexity_ffunc (int8[] state)
 This UDF is the finalfunc for the aggregator computing the perplexity. More...
 
aggregate int8 [] __lda_perplexity_agg (int4[], int4[], int4[], int8[], float8, float8, int4, int4)
 
int4 [] __lda_check_count_ceiling (int8[] arr, int4 r, int4 c)
 
set< int4[]> __lda_util_unnest (int8[] arr, int4 r, int4 c)
 Unnest a 2-D array into a set of 1-D arrays. More...
 
set< int4[]> __lda_util_unnest_transpose (int8[] arr, int4 r, int4 c)
 
int8 [][] __lda_util_transpose (int8[][] matrix)
 Transpose a 2-D array. More...
 
float8 [] __lda_util_norm_with_smoothing (float8[] arr, float8 smooth)
 L1 normalization with smoothing. More...
 
int4 [] __lda_util_index_sort (float8[] arr)
 This UDF returns the index of elements in a sorted order. More...
 
set< lda_result > __lda_util_norm_vocab (text vocab_table, text output_vocab_table)
 This UDF checks the vocabulary and converts non-contiguous wordids into contiguous integers ranging from 0 to voc_size - 1. More...
 
set< lda_result > __lda_util_norm_dataset (text data_table, text norm_vocab_table, text output_data_table)
 This UDF converts the data table according to the normalized vocabulary; all rows with non-positive count values are removed. More...
 
set< lda_result > __lda_util_conorm_data (text data_table, text vocab_table, text output_data_table, text output_vocab_table)
 This UDF extracts the list of wordids from the data table and joins it with the vocabulary table to get the list of common wordids. It then normalizes the vocabulary based on the common wordids, and normalizes the data table based on the normalized vocabulary. More...
 
_pivotalr_lda_model lda_parse_model (bigint[] lda_model, integer voc_size, integer topic_num)
 

Detailed Description

Date
Dec 2012
See also
For an introduction to Latent Dirichlet Allocation models, see the module description Latent Dirichlet Allocation.

Function Documentation

◆ __lda_check_count_ceiling()

int4 [] __lda_check_count_ceiling ( int8 []  arr,
int4  r,
int4  c 
)

◆ __lda_count_topic_agg()

aggregate int8 [] __lda_count_topic_agg ( int4  [],
int4  [],
int4  [],
int4  ,
int4   
)
Parameters
words: The unique words in the document
counts: The count of each unique word in the document
topic_assignment: The topic assignments in the document
voc_size: The size of the vocabulary
topic_num: The number of topics
Returns
The word topic counts (a 1-D array embedding a 2-D array)

◆ __lda_count_topic_prefunc()

int8 [] __lda_count_topic_prefunc ( int8 []  state1,
int8 []  state2 
)
Parameters
state1: The local word topic counts
state2: The local word topic counts
Returns
The element-wise sum of two local states

◆ __lda_count_topic_sfunc()

int8 [] __lda_count_topic_sfunc ( int8 []  state,
int4 []  words,
int4 []  counts,
int4 []  topic_assignment,
int4  voc_size,
int4  topic_num 
)
Parameters
state: The topic counts
words: The unique words in the document
counts: The count of each unique word in the document (sum(counts) = word_count)
topic_assignment: The topic assignments in the document
voc_size: The size of the vocabulary
topic_num: The number of topics
Returns
The updated state

◆ __lda_gibbs_sample()

int4 [] __lda_gibbs_sample ( int4 []  words,
int4 []  counts,
int4 []  doc_topic,
int8 []  model,
float8  alpha,
float8  beta,
int4  voc_size,
int4  topic_num,
int4  iter_num 
)
Parameters
words: The set of unique words in the document
counts: The count of each unique word in the document (sum(counts) = word_count)
doc_topic: The current per-document topic counts and topic assignments
model: The current model (including the per-word topic counts and the corpus-level topic counts)
alpha: The Dirichlet parameter for the per-document topic multinomial
beta: The Dirichlet parameter for the per-topic word multinomial
voc_size: The size of the vocabulary
topic_num: The number of topics
iter_num: The number of iterations
Returns
The learned topic counts and topic assignments

◆ __lda_perplexity_agg()

aggregate int8 [] __lda_perplexity_agg ( int4  [],
int4  [],
int4  [],
int8  [],
float8  ,
float8  ,
int4  ,
int4   
)

◆ __lda_perplexity_ffunc()

float8 __lda_perplexity_ffunc ( int8 []  state)
Parameters
state: The merged state
Returns
The perplexity

◆ __lda_perplexity_prefunc()

int8 [] __lda_perplexity_prefunc ( int8 []  state1,
int8 []  state2 
)
Parameters
state1: The local state
state2: The local state
Returns
The merged state

◆ __lda_perplexity_sfunc()

int8 [] __lda_perplexity_sfunc ( int8 []  state,
int4 []  words,
int4 []  counts,
int4 []  doc_topic,
int8 []  model,
float8  alpha,
float8  beta,
int4  voc_size,
int4  topic_num 
)
Parameters
state: The cached model plus perplexity
words: The unique words in the document
counts: The count of each unique word in the document
doc_topic: The topic counts in the document
model: The learned model
alpha: The Dirichlet parameter for the per-document topic multinomial
beta: The Dirichlet parameter for the per-topic word multinomial
voc_size: The size of the vocabulary
topic_num: The number of topics
Returns
The updated state

◆ __lda_random_assign()

int4 [] __lda_random_assign ( int4  word_count,
int4  topic_num 
)
Parameters
word_count: The number of words in the document
topic_num: The number of topics (specified by the user)
Returns
The topic counts and topic assignments

◆ __lda_util_conorm_data()

set<lda_result> __lda_util_conorm_data ( text  data_table,
text  vocab_table,
text  output_data_table,
text  output_vocab_table 
)
Parameters
data_table: The data table to be normalized
vocab_table: The vocabulary table to be normalized
output_data_table: The normalized data table
output_vocab_table: The normalized vocabulary table

◆ __lda_util_index_sort()

int4 [] __lda_util_index_sort ( float8 []  arr)
Parameters
arr: The array to be sorted
Returns
The index of elements

◆ __lda_util_norm_dataset()

set<lda_result> __lda_util_norm_dataset ( text  data_table,
text  norm_vocab_table,
text  output_data_table 
)
Parameters
data_table: The data table to be normalized
norm_vocab_table: The normalized vocabulary table
output_data_table: The normalized data table

◆ __lda_util_norm_vocab()

set<lda_result> __lda_util_norm_vocab ( text  vocab_table,
text  output_vocab_table 
)
Parameters
vocab_table: The vocabulary table in the form of <wordid::int4, word::text>
output_vocab_table: The regularized vocabulary table

◆ __lda_util_norm_with_smoothing()

float8 [] __lda_util_norm_with_smoothing ( float8 []  arr,
float8  smooth 
)
Parameters
arr: The array to be normalized
smooth: The smoothing parameter
Returns
The normalized vector
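
Given the parameter names, the normalization presumably follows the standard additive-smoothing form; this reconstruction is an assumption from the signature, not stated in this reference:

```latex
\mathrm{out}_i = \frac{\mathrm{arr}_i + \mathrm{smooth}}{\sum_{j=1}^{n} \mathrm{arr}_j + n \cdot \mathrm{smooth}}
```

where n is the length of arr, so that the output entries sum to 1 even when some counts are zero.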

◆ __lda_util_transpose()

int8 [][] __lda_util_transpose ( int8  matrix[][])
Parameters
matrix: The input 2-D array
Returns
The transposed array

◆ __lda_util_unnest()

set<int4[]> __lda_util_unnest ( int8 []  arr,
int4  r,
int4  c 
)
Parameters
arr: The 2-D array to be unnested
Returns
The unnested 1-D arrays

◆ __lda_util_unnest_transpose()

set<int4[]> __lda_util_unnest_transpose ( int8 []  arr,
int4  r,
int4  c 
)

◆ lda_get_perplexity()

float8 lda_get_perplexity ( text  model_table,
text  output_data_table 
)
Parameters
model_table: The model table generated by lda_train
output_data_table: The output data table generated by lda_predict
Returns
The perplexity
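
A hypothetical invocation against a MADlib installation; the table names my_model and my_pred are illustrative and assume a prior lda_train and lda_predict run:

```
-- Evaluate the held-out perplexity of a trained model
SELECT madlib.lda_get_perplexity(
    'my_model',  -- model table produced by lda_train
    'my_pred'    -- output data table produced by lda_predict
);
```

Lower perplexity generally indicates a better fit of the model to the evaluated dataset.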

◆ lda_get_topic_desc()

set<lda_result> lda_get_topic_desc ( text  model_table,
text  vocab_table,
text  desc_table,
int4  top_k 
)
Parameters
model_table: The model table generated by the training process
vocab_table: The vocabulary table (<wordid, word>)
desc_table: The output table for storing the per-topic descriptions
top_k: The number of top words for each topic description
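
A hypothetical call, assuming a trained model in my_model and a vocabulary table my_vocab of <wordid, word> rows (both names are illustrative):

```
-- Write the top 15 words per topic into my_desc
SELECT madlib.lda_get_topic_desc(
    'my_model',  -- model table from lda_train
    'my_vocab',  -- vocabulary table (<wordid, word>)
    'my_desc',   -- output table for per-topic descriptions
    15           -- top_k words per topic
);
```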

◆ lda_get_topic_word_count()

set<lda_result> lda_get_topic_word_count ( text  model_table,
text  output_table 
)
Parameters
model_table: The model table generated by the training process
output_table: The output table storing the per-topic word counts

◆ lda_get_word_topic_count()

set<lda_result> lda_get_word_topic_count ( text  model_table,
text  output_table 
)
Parameters
model_table: The model table generated by the training process
output_table: The output table storing the per-word topic counts

◆ lda_get_word_topic_mapping()

set<lda_result> lda_get_word_topic_mapping ( text  lda_output_table,
text  mapping_table 
)
Parameters
lda_output_table: The output table from LDA training or prediction
mapping_table: The result table that stores the wordid - topicid mapping

◆ lda_parse_model()

_pivotalr_lda_model lda_parse_model ( bigint []  lda_model,
integer  voc_size,
integer  topic_num 
)

◆ lda_predict() [1/2]

set<lda_result> lda_predict ( text  data_table,
text  model_table,
text  output_table 
)
Parameters
data_table: Table storing the test dataset; each row is in the form of <docid, wordid, count>, where docid, wordid, and count are all non-negative integers
model_table: Table storing the learned model
output_table: Table storing the per-document topic counts and topic assignments
Note
The default iter_num is 20.
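
A hypothetical prediction call; my_testing, my_model, and my_pred are illustrative names, and my_testing is assumed to hold <docid, wordid, count> rows:

```
-- Label a test corpus with topics using a previously trained model
SELECT madlib.lda_predict(
    'my_testing',  -- data table with <docid, wordid, count> rows
    'my_model',    -- model table produced by lda_train
    'my_pred'      -- output table of per-document topic assignments
);
```

The overloaded form adds a fourth argument, iter_num, for callers who want more or fewer sampling iterations than the default of 20.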

◆ lda_predict() [2/2]

set<lda_result> lda_predict ( text  data_table,
text  model_table,
text  output_table,
int4  iter_num 
)

◆ lda_train()

set<lda_result> lda_train ( text  data_table,
text  model_table,
text  output_data_table,
int4  voc_size,
int4  topic_num,
int4  iter_num,
float8  alpha,
float8  beta 
)
Parameters
data_table: Table storing the training dataset; each row is in the form of <docid, wordid, count>, where docid, wordid, and count are all non-negative integers
model_table: Table storing the learned model (voc_size, topic_num, alpha, beta, per-word topic counts, and corpus-level topic counts)
output_data_table: Table storing the output data in the form of <docid, wordcount, words, counts, topic_count, topic_assignment>
voc_size: Size of the vocabulary (note that wordids should be contiguous integers from 0 to voc_size - 1; a data validation routine will be called to validate the dataset)
topic_num: Number of topics (e.g. 100)
iter_num: Number of iterations (e.g. 60)
alpha: Dirichlet parameter for the per-document topic multinomial (e.g. 50/topic_num)
beta: Dirichlet parameter for the per-topic word multinomial (e.g. 0.01)
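
A hypothetical training call sketching how the arguments fit together; the table names and parameter values are illustrative, assuming a 5000-word vocabulary prepared with contiguous wordids:

```
-- Train a 10-topic LDA model over 20 Gibbs sampling iterations
SELECT madlib.lda_train(
    'my_training',  -- data table with <docid, wordid, count> rows
    'my_model',     -- model table (created by the call)
    'my_outdata',   -- output data table (created by the call)
    5000,           -- voc_size: vocabulary size
    10,             -- topic_num
    20,             -- iter_num
    5.0,            -- alpha, following the 50/topic_num heuristic
    0.01            -- beta
);
```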