223 lines
No EOL
6.6 KiB
C
223 lines
No EOL
6.6 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include "preprocessing.h"
|
|
|
|
Feature *init_feature(float value)
|
|
{
|
|
Feature *feature = (Feature*)malloc(sizeof(Feature));
|
|
feature->value = value;
|
|
feature->next_feature = NULL;
|
|
|
|
return feature;
|
|
}
|
|
|
|
OneHotLabel *init_onehotlabel(float value)
|
|
{
|
|
OneHotLabel *hot = (OneHotLabel*)malloc(sizeof(OneHotLabel));
|
|
hot->value = value;
|
|
hot->next = NULL;
|
|
|
|
return hot;
|
|
}
|
|
|
|
Sample *init_sample()
|
|
{
|
|
Sample *sample = (Sample*)malloc(sizeof(Sample));
|
|
sample->first_feature = NULL;
|
|
sample->first_hot = NULL;
|
|
sample->next_sample = NULL;
|
|
sample->t = TRAIN;
|
|
|
|
return sample;
|
|
}
|
|
|
|
Data *init_data()
|
|
{
|
|
Data *data = (Data*)malloc(sizeof(Data));
|
|
data->first_sample = NULL;
|
|
data->last_sample = NULL;
|
|
data->size = 0;
|
|
|
|
return data;
|
|
}
|
|
|
|
/**
 * Append `sample` at the end of `data`'s sample list.
 *
 * The list is treated as empty when `data->size` is 0; in that case the
 * sample also becomes the head. In every case it becomes the new tail and
 * the size is incremented by one.
 *
 * @param data    Data set receiving the sample.
 * @param sample  Sample to append.
 */
void add_sample_to_data(Data *data, Sample *sample)
{
    if (data->size != 0) {
        /* Non-empty list: hook the sample after the current tail. */
        data->last_sample->next_sample = sample;
    } else {
        /* Empty list: the sample is also the head. */
        data->first_sample = sample;
    }

    data->last_sample = sample;
    data->size++;
}
|
|
|
|
/**
 * Free a sample together with its feature list and its label list.
 *
 * Only the sample's own nodes are released; `next_sample` is not followed.
 *
 * @param sample  Sample to free. NULL is accepted and ignored, mirroring
 *                free(NULL).
 */
void destroy_sample(Sample *sample)
{
    if (sample == NULL) {
        return;
    }

    Feature *feature = sample->first_feature;
    while (feature != NULL) {
        /* Read the successor before the node is released. */
        Feature *next_feature = feature->next_feature;
        free(feature);
        feature = next_feature;
    }

    OneHotLabel *hot = sample->first_hot;
    while (hot != NULL) {
        OneHotLabel *next_hot = hot->next;
        free(hot);
        hot = next_hot;
    }

    free(sample);
}
|
|
|
|
/**
 * Free a data set and every sample reachable from its head.
 *
 * Each sample is released with destroy_sample(), which also frees its
 * feature and label nodes.
 *
 * @param data  Data set to free. NULL is accepted and ignored, mirroring
 *              free(NULL).
 */
void destroy_data(Data *data)
{
    if (data == NULL) {
        return;
    }

    Sample *sample = data->first_sample;
    while (sample != NULL) {
        /* Read the successor before the sample is released. */
        Sample *next_sample = sample->next_sample;
        destroy_sample(sample);
        sample = next_sample;
    }

    free(data);
}
|
|
|
|
Data *csv_to_samples(char *path_to_csv, int n_features, char *features_separator, float train_percent, char *apply_onthot_encode, int n_classes)
|
|
{
|
|
Data *data = init_data();
|
|
FILE *file = fopen(path_to_csv, "r");
|
|
char line[100], *dup;
|
|
char *token;
|
|
int features_count;
|
|
if(file != NULL)
|
|
{
|
|
while(fgets(line, 100, file) != NULL)
|
|
{
|
|
Sample *current_sample = init_sample();
|
|
Feature *temp_last_feature;
|
|
features_count = n_features;
|
|
dup = strtok(line, "\n"); //extracting line content without '\n'
|
|
token = strtok(dup, features_separator);
|
|
while(token != NULL)
|
|
{
|
|
if(features_count > 0) //first put features into current sample
|
|
{
|
|
if(current_sample->first_feature == NULL)
|
|
{
|
|
current_sample->first_feature = init_feature( atof(token) );
|
|
temp_last_feature = current_sample->first_feature;
|
|
}else
|
|
{
|
|
temp_last_feature->next_feature = init_feature( atof(token) );
|
|
temp_last_feature = temp_last_feature->next_feature;
|
|
}
|
|
features_count--;
|
|
}else //put label into current sample
|
|
{
|
|
if(strcmp(apply_onthot_encode, "yes") == 0)
|
|
{
|
|
OneHotLabel *temp_last_hotlabel;
|
|
int i;
|
|
for(i=0 ; i<n_classes ; i++)
|
|
{
|
|
if(current_sample->first_hot == NULL)
|
|
{
|
|
current_sample->first_hot = init_onehotlabel(0.0);
|
|
temp_last_hotlabel = current_sample->first_hot;
|
|
}else
|
|
{
|
|
temp_last_hotlabel->next = init_onehotlabel(0.0);
|
|
temp_last_hotlabel = temp_last_hotlabel->next;
|
|
}
|
|
if((float)i == atof(token) )
|
|
{
|
|
temp_last_hotlabel->value = 1.0;
|
|
}
|
|
}
|
|
}else //when not applying one hot encoding, first_hot is directly equal to label
|
|
{
|
|
current_sample->first_hot = init_onehotlabel( atof(token) );
|
|
}
|
|
}
|
|
token = strtok(NULL, features_separator);
|
|
}
|
|
add_sample_to_data(data, current_sample);
|
|
}
|
|
Sample *current_sample = data->first_sample;
|
|
int number_of_train_samples = ((float)data->size*train_percent) / 100.0; //calculate number of training samples based on provided percentage
|
|
while(number_of_train_samples > 0)
|
|
{
|
|
current_sample = current_sample->next_sample;
|
|
number_of_train_samples--;
|
|
}
|
|
while(current_sample != NULL)
|
|
{
|
|
current_sample->t = TEST;
|
|
current_sample = current_sample->next_sample;
|
|
}
|
|
fclose(file);
|
|
}else
|
|
{
|
|
printf("Unable to open the file\n");
|
|
exit(-1);
|
|
}
|
|
return data;
|
|
}
|
|
|
|
void print_data(const Data *data)
|
|
{
|
|
Sample *current_sample = data->first_sample;
|
|
Feature *temp_feature;
|
|
OneHotLabel *temp_hotlabel;
|
|
printf("#=============================================#\n");
|
|
if(current_sample != NULL)
|
|
{
|
|
int count = 1;
|
|
while(current_sample != NULL)
|
|
{
|
|
printf("Sample %d : ", count);
|
|
if(current_sample->t == TRAIN)
|
|
{
|
|
printf("TR\n");
|
|
}else if(current_sample->t == TEST)
|
|
{
|
|
printf("TE\n");
|
|
}else
|
|
{
|
|
printf("VA\n");
|
|
}
|
|
temp_feature = current_sample->first_feature;
|
|
while(temp_feature != NULL)
|
|
{
|
|
printf("%f ", temp_feature->value);
|
|
temp_feature = temp_feature->next_feature;
|
|
}
|
|
temp_hotlabel = current_sample->first_hot;
|
|
printf(">##> ");
|
|
while(temp_hotlabel != NULL)
|
|
{
|
|
printf("%f ", temp_hotlabel->value);
|
|
temp_hotlabel = temp_hotlabel->next;
|
|
}
|
|
current_sample = current_sample->next_sample;
|
|
if(current_sample != NULL)
|
|
{
|
|
printf("\n#---------------------------------------------#\n");
|
|
}else
|
|
{
|
|
printf("\n");
|
|
}
|
|
count++;
|
|
}
|
|
}else
|
|
{
|
|
printf("Nothing to print : empty data !!!\n");
|
|
}
|
|
printf("#=============================================#\n");
|
|
} |