cbasedann/preprocessing.c

223 lines
No EOL
6.6 KiB
C

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "preprocessing.h"
/*
 * Allocates a new feature node holding `value`.
 *
 * The node starts unlinked (next_feature == NULL).
 * Returns the new node, or NULL if the allocation fails.
 * The node is released with free() (destroy_sample does this for linked nodes).
 */
Feature *init_feature(float value)
{
    Feature *feature = malloc(sizeof *feature);
    if (feature == NULL)
    {
        // Report the failure to the caller instead of writing through NULL.
        return NULL;
    }
    feature->value = value;
    feature->next_feature = NULL;
    return feature;
}
/*
 * Allocates a new label node holding `value`.
 *
 * The node starts unlinked (next == NULL).
 * Returns the new node, or NULL if the allocation fails.
 * The node is released with free() (destroy_sample does this for linked nodes).
 */
OneHotLabel *init_onehotlabel(float value)
{
    OneHotLabel *hot = malloc(sizeof *hot);
    if (hot == NULL)
    {
        // Report the failure to the caller instead of writing through NULL.
        return NULL;
    }
    hot->value = value;
    hot->next = NULL;
    return hot;
}
/*
 * Allocates an empty sample: no features, no labels, no successor,
 * and tagged TRAIN by default.
 *
 * Returns the new sample, or NULL if the allocation fails.
 * Release it with destroy_sample().
 */
Sample *init_sample()
{
    Sample *sample = malloc(sizeof *sample);
    if (sample == NULL)
    {
        // Report the failure to the caller instead of writing through NULL.
        return NULL;
    }
    sample->first_feature = NULL;
    sample->first_hot = NULL;
    sample->next_sample = NULL;
    sample->t = TRAIN;
    return sample;
}
/*
 * Allocates an empty data set: no samples and a size of 0.
 *
 * Returns the new container, or NULL if the allocation fails.
 * Release it with destroy_data().
 */
Data *init_data()
{
    Data *data = malloc(sizeof *data);
    if (data == NULL)
    {
        // Report the failure to the caller instead of writing through NULL.
        return NULL;
    }
    data->first_sample = NULL;
    data->last_sample = NULL;
    data->size = 0;
    return data;
}
/*
 * Appends `sample` at the end of `data` and increments data->size.
 *
 * An empty container (size == 0) gets `sample` as its first element;
 * otherwise `sample` is linked after the current last element.
 */
void add_sample_to_data(Data *data, Sample *sample)
{
    if (data->size != 0)
    {
        // Non-empty list: hook the new sample behind the current tail.
        data->last_sample->next_sample = sample;
    }
    else
    {
        // Empty list: the new sample is also the head.
        data->first_sample = sample;
    }

    // In both cases the new sample becomes the tail.
    data->last_sample = sample;
    data->size++;
}
/*
 * Frees a sample together with its whole feature list and label list.
 *
 * Passing NULL is a no-op, mirroring free(NULL).
 * The sample's next_sample link is not followed.
 */
void destroy_sample(Sample *sample)
{
    if (sample == NULL)
    {
        return;
    }

    Feature *feature = sample->first_feature;
    while (feature != NULL)
    {
        // Save the successor before the node is released.
        Feature *next_feature = feature->next_feature;
        free(feature);
        feature = next_feature;
    }

    OneHotLabel *hot = sample->first_hot;
    while (hot != NULL)
    {
        OneHotLabel *next_hot = hot->next;
        free(hot);
        hot = next_hot;
    }

    free(sample);
}
/*
 * Frees a data set: every sample in the list (via destroy_sample) and then
 * the container itself.
 *
 * Passing NULL is a no-op, mirroring free(NULL).
 */
void destroy_data(Data *data)
{
    if (data == NULL)
    {
        return;
    }

    Sample *sample = data->first_sample;
    while (sample != NULL)
    {
        // Save the successor before the sample is released.
        Sample *next_sample = sample->next_sample;
        destroy_sample(sample);
        sample = next_sample;
    }

    free(data);
}
/*
 * Returns `pointer` unchanged, or prints a message and terminates the
 * program with exit(-1) when it is NULL (same failure style as the
 * "Unable to open the file" path of csv_to_samples).
 */
static void *csv_require_allocation(void *pointer)
{
    if (pointer == NULL)
    {
        printf("Unable to allocate memory\n");
        exit(-1);
    }
    return pointer;
}

/*
 * Reads one complete line of any length from `file` into the growable
 * buffer (*buffer, *capacity), which the caller owns and must free().
 *
 * Returns *buffer (NUL-terminated, line terminator still included when
 * present), or NULL when nothing more can be read.
 */
static char *csv_read_line(FILE *file, char **buffer, size_t *capacity)
{
    char chunk[256];
    size_t length = 0;

    while (fgets(chunk, (int)sizeof chunk, file) != NULL)
    {
        const size_t chunk_length = strlen(chunk);
        const size_t needed = length + chunk_length + 1;

        if (needed > *capacity)
        {
            size_t new_capacity = (*capacity == 0) ? sizeof chunk : *capacity;
            while (new_capacity < needed)
            {
                new_capacity *= 2;
            }
            // A failed realloc terminates the program, so overwriting
            // *buffer directly cannot leak the old block.
            *buffer = csv_require_allocation(realloc(*buffer, new_capacity));
            *capacity = new_capacity;
        }

        // Copy the chunk together with its terminating NUL.
        memcpy(*buffer + length, chunk, chunk_length + 1);
        length += chunk_length;

        if (chunk_length > 0 && chunk[chunk_length - 1] == '\n')
        {
            return *buffer; // reached the end of the line
        }
    }

    // End of file: hand back a last line that has no trailing newline.
    return (length > 0) ? *buffer : NULL;
}

/*
 * Loads a CSV file into a linked list of samples.
 *
 * path_to_csv         file to read (opened in text mode).
 * n_features          number of leading tokens of each line stored as features.
 * features_separator  delimiter characters handed to strtok().
 * train_percent       percentage of the samples, counted from the start of
 *                     the file, that keep the TRAIN tag; every later sample
 *                     is tagged TEST.
 * apply_onthot_encode "yes" turns each label token into n_classes entries
 *                     (1.0 at the index equal to the label, 0.0 elsewhere);
 *                     any other value (or NULL) stores the label value as a
 *                     single entry.
 * n_classes           number of entries generated per label in one-hot mode.
 *
 * Lines may be of any length. Lines that contain no token (for instance
 * blank lines) are skipped and produce no sample.
 *
 * Returns the populated Data, to be released with destroy_data().
 * The program is terminated with exit(-1) when the file cannot be opened
 * or an allocation fails.
 */
Data *csv_to_samples(char *path_to_csv, int n_features, char *features_separator, float train_percent, char *apply_onthot_encode, int n_classes)
{
    FILE *file = fopen(path_to_csv, "r");
    if (file == NULL)
    {
        printf("Unable to open the file\n");
        exit(-1);
    }

    Data *data = csv_require_allocation(init_data());
    const int one_hot = (apply_onthot_encode != NULL && strcmp(apply_onthot_encode, "yes") == 0);

    char *line_buffer = NULL;
    size_t line_capacity = 0;
    char *line;

    while ((line = csv_read_line(file, &line_buffer, &line_capacity)) != NULL)
    {
        // Cut the line at its terminator ("\n" or "\r\n").
        line[strcspn(line, "\r\n")] = '\0';

        char *token = strtok(line, features_separator);
        if (token == NULL)
        {
            continue; // nothing on this line: do not create an empty sample
        }

        Sample *current_sample = csv_require_allocation(init_sample());
        // Tail pointers live for the whole line so that every token appends
        // to the right list, whatever the number of label tokens.
        Feature *last_feature = NULL;
        OneHotLabel *last_hot = NULL;
        int features_left = n_features;

        for (; token != NULL; token = strtok(NULL, features_separator))
        {
            if (features_left > 0) // first put features into current sample
            {
                Feature *feature = csv_require_allocation(init_feature(atof(token)));
                if (last_feature == NULL)
                {
                    current_sample->first_feature = feature;
                }
                else
                {
                    last_feature->next_feature = feature;
                }
                last_feature = feature;
                features_left--;
            }
            else if (one_hot) // expand the label into n_classes entries
            {
                const double label = atof(token);
                for (int i = 0; i < n_classes; i++)
                {
                    OneHotLabel *hot = csv_require_allocation(init_onehotlabel(0.0f));
                    if ((float)i == label)
                    {
                        hot->value = 1.0f;
                    }
                    if (last_hot == NULL)
                    {
                        current_sample->first_hot = hot;
                    }
                    else
                    {
                        last_hot->next = hot;
                    }
                    last_hot = hot;
                }
            }
            else // without one hot encoding, first_hot directly holds the label
            {
                OneHotLabel *label = csv_require_allocation(init_onehotlabel(atof(token)));
                // A later label token replaces the earlier one; release the
                // replaced node so it is not leaked.
                free(current_sample->first_hot);
                current_sample->first_hot = label;
            }
        }

        add_sample_to_data(data, current_sample);
    }
    free(line_buffer);

    // Number of training samples derived from the provided percentage
    // (fractional part truncated).
    int number_of_train_samples = (int)(((float)data->size * train_percent) / 100.0);

    // Skip the training samples; the NULL check keeps a percentage above
    // 100 from walking past the end of the list.
    Sample *cursor = data->first_sample;
    while (number_of_train_samples > 0 && cursor != NULL)
    {
        cursor = cursor->next_sample;
        number_of_train_samples--;
    }

    // Everything after the training portion becomes test data.
    for (; cursor != NULL; cursor = cursor->next_sample)
    {
        cursor->t = TEST;
    }

    fclose(file);
    return data;
}
/*
 * Prints every sample of `data` to standard output.
 *
 * For each sample: its 1-based position, a TR / TE / VA tag derived from
 * its `t` field, its feature values, the ">##>" marker, then its label
 * values. Consecutive samples are separated by a dashed rule and the whole
 * listing is framed by two '=' rules. An empty data set prints a notice
 * between the two rules instead.
 */
void print_data(const Data *data)
{
    const Sample *sample = data->first_sample;

    printf("#=============================================#\n");

    if (sample == NULL)
    {
        printf("Nothing to print : empty data !!!\n");
        printf("#=============================================#\n");
        return;
    }

    for (int position = 1; sample != NULL; position++)
    {
        printf("Sample %d : ", position);

        // Any tag other than TRAIN or TEST is shown as "VA".
        if (sample->t == TRAIN)
        {
            printf("TR\n");
        }
        else if (sample->t == TEST)
        {
            printf("TE\n");
        }
        else
        {
            printf("VA\n");
        }

        for (const Feature *feature = sample->first_feature; feature != NULL; feature = feature->next_feature)
        {
            printf("%f ", feature->value);
        }

        printf(">##> ");

        for (const OneHotLabel *hot = sample->first_hot; hot != NULL; hot = hot->next)
        {
            printf("%f ", hot->value);
        }

        sample = sample->next_sample;

        // The dashed rule goes only between samples, not after the last one.
        if (sample != NULL)
        {
            printf("\n#---------------------------------------------#\n");
        }
        else
        {
            printf("\n");
        }
    }

    printf("#=============================================#\n");
}