#include #include #include #include "preprocessing.h" Feature *init_feature(float value) { Feature *feature = (Feature*)malloc(sizeof(Feature)); feature->value = value; feature->next_feature = NULL; return feature; } OneHotLabel *init_onehotlabel(float value) { OneHotLabel *hot = (OneHotLabel*)malloc(sizeof(OneHotLabel)); hot->value = value; hot->next = NULL; return hot; } Sample *init_sample() { Sample *sample = (Sample*)malloc(sizeof(Sample)); sample->first_feature = NULL; sample->first_hot = NULL; sample->next_sample = NULL; sample->t = TRAIN; return sample; } Data *init_data() { Data *data = (Data*)malloc(sizeof(Data)); data->first_sample = NULL; data->last_sample = NULL; data->size = 0; return data; } void add_sample_to_data(Data *data, Sample *sample) { if(data->size == 0) { data->first_sample = sample; data->last_sample = sample; data->size++; }else { data->last_sample->next_sample = sample; data->last_sample = sample; data->size++; } } void destroy_sample(Sample *sample) { Feature *temp1; while (sample->first_feature != NULL) { temp1 = sample->first_feature; sample->first_feature = sample->first_feature->next_feature; free(temp1); } OneHotLabel *temp2; while (sample->first_hot != NULL) { temp2 = sample->first_hot; sample->first_hot = sample->first_hot->next; free(temp2); } free(sample); } void destroy_data(Data *data) { Sample *temp; while (data->first_sample != NULL) { temp = data->first_sample; data->first_sample = data->first_sample->next_sample; destroy_sample(temp); } free(data); } Data *csv_to_samples(char *path_to_csv, int n_features, char *features_separator, float train_percent, char *apply_onthot_encode, int n_classes) { Data *data = init_data(); FILE *file = fopen(path_to_csv, "r"); char line[100], *dup; char *token; int features_count; if(file != NULL) { while(fgets(line, 100, file) != NULL) { Sample *current_sample = init_sample(); Feature *temp_last_feature; features_count = n_features; dup = strtok(line, "\n"); //extracting line content without '\n' token = strtok(dup, features_separator); while(token != NULL) { if(features_count > 0) //first put features into current sample { if(current_sample->first_feature == NULL) { current_sample->first_feature = init_feature( atof(token) ); temp_last_feature = current_sample->first_feature; }else { temp_last_feature->next_feature = init_feature( atof(token) ); temp_last_feature = temp_last_feature->next_feature; } features_count--; }else //put label into current sample { if(strcmp(apply_onthot_encode, "yes") == 0) { OneHotLabel *temp_last_hotlabel; int i; for(i=0 ; ifirst_hot == NULL) { current_sample->first_hot = init_onehotlabel(0.0); temp_last_hotlabel = current_sample->first_hot; }else { temp_last_hotlabel->next = init_onehotlabel(0.0); temp_last_hotlabel = temp_last_hotlabel->next; } if((float)i == atof(token) ) { temp_last_hotlabel->value = 1.0; } } }else //when not applying one hot encoding, first_hot is directly equal to label { current_sample->first_hot = init_onehotlabel( atof(token) ); } } token = strtok(NULL, features_separator); } add_sample_to_data(data, current_sample); } Sample *current_sample = data->first_sample; int number_of_train_samples = ((float)data->size*train_percent) / 100.0; //calculate number of training samples based on provided percentage while(number_of_train_samples > 0) { current_sample = current_sample->next_sample; number_of_train_samples--; } while(current_sample != NULL) { current_sample->t = TEST; current_sample = current_sample->next_sample; } fclose(file); }else { printf("Unable to open the file\n"); exit(-1); } return data; } void print_data(const Data *data) { Sample *current_sample = data->first_sample; Feature *temp_feature; OneHotLabel *temp_hotlabel; printf("#=============================================#\n"); if(current_sample != NULL) { int count = 1; while(current_sample != NULL) { printf("Sample %d : ", count); if(current_sample->t == TRAIN) { printf("TR\n"); }else if(current_sample->t == TEST) { printf("TE\n"); }else { printf("VA\n"); } temp_feature = current_sample->first_feature; while(temp_feature != NULL) { printf("%f ", temp_feature->value); temp_feature = temp_feature->next_feature; } temp_hotlabel = current_sample->first_hot; printf(">##> "); while(temp_hotlabel != NULL) { printf("%f ", temp_hotlabel->value); temp_hotlabel = temp_hotlabel->next; } current_sample = current_sample->next_sample; if(current_sample != NULL) { printf("\n#---------------------------------------------#\n"); }else { printf("\n"); } count++; } }else { printf("Nothing to print : empty data !!!\n"); } printf("#=============================================#\n"); }