// TDDE18CppCourse/lab5/main.cc

#include <iostream>
#include <fstream>
#include <iomanip>
#include <algorithm>
#include <iterator>
#include <vector>
#include <map>
#include <cctype>


/*
    Lab 5 - TDDE18
    Wordlists lab - Main file
*/

using namespace std;

//////////////////////////////////
/// User interaction functions ///
//////////////////////////////////

//TODO: Non fundamental types taken as parameters should not be copied
//as they might be large. Parameters which are not changed should also
//be const. --> done.

// Error helper: writes the given message, prefixed with "Error: ", to the
// standard error stream, followed by the command-line usage summary.
void print_err(const std::string & str){
    static const char * const usage_text = "Usage: ./a.out FILE [-a] [-f] [-o N]";
    std::cerr << "Error: " << str << std::endl
              << usage_text << std::endl;
}

// Print final wordlist
/// For alphabetic ordering: prints one line per entry, the word left-aligned
/// and padded to max_length + 1 columns, immediately followed by its count.
/// Entries are printed in the order they appear in wl.
/// Note: std::left remains set on cout after this call.
void print_a(const std::vector<std::pair<std::string,int>> & wl, int max_length) {
    // wl is const, so its elements can only be bound to a const reference.
    std::for_each(wl.begin(), wl.end(),
        [max_length](const std::pair<std::string,int> & entry){
            std::cout << std::left << std::setw(max_length + 1)
                << entry.first << entry.second << '\n';
        });
}

/// For frequency ordering: prints one line per entry, the word right-aligned
/// in max_length columns, then two spaces and its count.
/// Entries are printed in the order they appear in wl.
/// Note: std::right remains set on cout after this call.
void print_f(const std::vector<std::pair<std::string,int>> & wl, int max_length) {
    // wl is const, so its elements can only be bound to a const reference.
    std::for_each(wl.begin(), wl.end(),
        [max_length](const std::pair<std::string,int> & entry){
            // Alignment is requested explicitly so that a previously set
            // std::left on cout cannot change the layout.
            std::cout << std::right << std::setw(max_length)
                << entry.first << "  " << entry.second << '\n';
        });
}

/// For -o argument: prints the words in their given order, separated by
/// single spaces. A new output line is started before a word whenever the
/// current line length plus the word length reaches limit - 1.
/// Empty entries in list are skipped, and no blank lines are emitted: a word
/// that is too long on its own is printed alone on its line, and an empty
/// list produces no output.
void print_o(const std::vector<std::string> & list, int limit) {
    std::string line{};
    std::for_each(list.begin(), list.end(), [&line, limit](const std::string & word){
        if (word.empty())
            return;     // nothing to print, would only add a stray space
        if (line.empty()){
            // First word of a line: always accepted, so an over-long word
            // never causes an empty line to be flushed.
            line = word;
            return;
        }
        if (static_cast<int>(line.length() + word.length()) >= limit - 1){
            std::cout << line << '\n';
            line = word;
        }
        else {
            line += ' ';
            line += word;
        }
    });
    if (!line.empty())
        std::cout << line << '\n';
}


/////////////////////////
/// Utility functions ///
/////////////////////////

//TODO: This function does too much. Divide the tasks into more
//functions which clearly state what they handle. For example:
//remove_trash, is_valid, etc. --> done.

//TODO: Enumerating the entire alphabet should be avoided since
//missing a character is easy. Use the library cctype for this
//instead. --> done.

// Check if a given word is valid.
// A word is valid when it has at least three characters, consists only of
// letters and hyphens, neither starts nor ends with a hyphen, and contains
// no two consecutive hyphens. Returns false for anything else, including
// the empty string.
bool is_valid(const std::string & word) {
    // Length is checked first: front()/back() below require a non-empty word.
    if (word.length() < 3)
        return false;
    if (word.front() == '-' || word.back() == '-')
        return false;
    if (word.find("--") != std::string::npos)
        return false;
    return std::all_of(word.begin(), word.end(), [](const char c){
        // isalpha is undefined for negative values, hence the cast.
        return std::isalpha(static_cast<unsigned char>(c)) != 0 || c == '-';
    });
}


// Length comparator: returns true when a is strictly shorter than b.
bool cmp_length(const std::string & a, const std::string & b){
    const std::string::size_type len_a = a.size();
    const std::string::size_type len_b = b.size();
    return len_b > len_a;
}

//Comment: Why an unsigned index type instead of int? find_first.. returns
//an index in the string, which would fit in a regular int. --> Because
//std::string::size_type is the return type of these functions. Storing the
//result in an int makes the comparison with std::string::npos a
//signed/unsigned mismatch, which GCC reports as:
//
//main.cc:103:15: warning: comparison of integer expressions of different signedness:
//      ‘int’ and ‘const size_type’ {aka ‘const long unsigned int’} [-Wsign-compare]
//103 |     if (begin != std::string::npos && end != std::string::npos && begin < end){
//

// Cleans a raw token: strips leading ( " ' and trailing ! ? ; , : . " ' )
// characters, removes a trailing 's, and lower-cases what is left.
// The word is modified in place and a copy of the result is returned.
// The result may be empty (for example for the token 's).
std::string remove_trash(std::string & word){
    // Here we remove head and tail junk. A lot of the complexity
    // is due to weird cases like "(?!)" or ")?!(": when the first
    // character that is not head junk does not come before the last
    // character that is not tail junk, the word is left untouched.
    const std::string::size_type begin = word.find_first_not_of("(\"\'");
    const std::string::size_type end = word.find_last_not_of("!?;,:.\"\')");
    if (begin != std::string::npos && end != std::string::npos && begin < end){
        word.erase(end + 1);    // tail first, so that begin is still valid
        word.erase(0, begin);
    }
    // Manage the 's tail trash. The length check keeps the two index
    // accesses inside the string for words shorter than two characters.
    const std::string::size_type length = word.length();
    if (length >= 2 && word[length - 2] == '\'' && word[length - 1] == 's')
        word.erase(length - 2);
    // Lowering case. tolower is undefined for negative values, so each
    // character is passed as unsigned char.
    std::transform(word.begin(), word.end(), word.begin(), [](unsigned char c){
        return static_cast<char>(std::tolower(c));
    });
    return word;
}


//TODO: See lab instructions for correct error message in case of
//invalid input. --> done. Note: In some cases the PDF does not
//specify what should be displayed; in those cases we display
//the message that seemed most suitable to us.

// Program entry point.
// Usage: ./a.out FILE [-a] [-f] [-o N]
//   -a   print each valid word with its count, in alphabetical order
//   -f   print each valid word with its count, most frequent first
//   -o N print the valid words in file order, wrapped according to N
// Returns 0 on success and a non-zero code on argument errors:
//   1 no arguments, 2 option missing, 3 file cannot be opened,
//   4 invalid option, 5 N is not a number, 6 wrong argument count for -o.
int main(int argc, char* argv[]) {

    ////////////////////////////
    /// Arguments management ///
    ////////////////////////////

    /// Args parsing
    // Basic checks
    if (argc <= 1 ){
        print_err("No arguments given.");
        return 1;
    }
    else if (argc == 2){
        print_err("Second argument missing or invalid.");
        return 2;
    }

    // Checking file
    const std::vector<std::string> args(argv, argv + argc);
    const std::string & file_name = args[1];

    std::ifstream file{file_name};

    if (! file.is_open()){
        print_err("Second argument missing or invalid.");
        return 3;
    }

    // Checking command
    // The option must be exactly "-a", "-f" or "-o". The length is checked
    // before indexing so that an empty or one-character argument is never
    // read out of bounds.
    const std::string & option = args[2];
    if (option.length() != 2 || option[0] != '-'
        || (option[1] != 'a' && option[1] != 'f' && option[1] != 'o')){
        print_err("Third argument invalid.");
        return 4;
    }
    const char param = option[1];
    int line_length{};

    if (param == 'o'){
        if (argc != 4) {
            print_err("You must specify a N value.");
            return 6;
        }
        try {
            line_length = std::stoi(args[3]);
        }
        catch (const std::exception & e){
            print_err("Invalid N value. (" + std::string(e.what()) + ")");
            return 5;
        }
    }

    /////////////////////
    /// File analysis ///
    /////////////////////

    std::istream_iterator<std::string> iis {file};
    std::istream_iterator<std::string> eof;
    std::vector<std::string> dirty_words(iis, eof);


    //TODO: Here you use for_each to emulate a for-loop. This is not
    //an appropriate algorithm for this task. What you want to do is
    //transform each element in dirty_words. Hint: Check out the
    //transform algorithm. See TODO about dividing the purifier
    //function before fixing this TODO. --> done.

    //TODO: This algorithm does too much. Break it down into more
    //appropriate algorithms. For example, use std::max_element to
    //find the largest element, etc. --> done.

    std::transform(dirty_words.begin(), dirty_words.end(), dirty_words.begin(), remove_trash);

    // Keep only the valid words. back_inserter makes `all` hold exactly the
    // accepted words (no default-constructed empty strings at the end).
    // Cleaning can leave an empty string, which is rejected up front.
    std::vector<std::string> all;
    all.reserve(dirty_words.size());
    std::copy_if(dirty_words.begin(), dirty_words.end(), std::back_inserter(all),
        [](const std::string & word){
            return !word.empty() && is_valid(word);
        });

    // Longest word, used as column width. max_element returns end() on an
    // empty range, which must not be dereferenced.
    int max_length{0};
    if (!all.empty()){
        max_length = static_cast<int>(
            std::max_element(all.begin(), all.end(), cmp_length)->length());
    }

    // Counting word in a std::map
    std::map<std::string, int> collection;
    std::for_each(all.begin(), all.end(), [&collection](const std::string & word) {
        ++collection[word];
    });


    // Converting map collection into vector wordlist to allow sorting.
    // The map iterates in ascending key order, so wordlist starts out
    // alphabetically sorted.
    std::vector<std::pair<std::string,int>> wordlist(collection.begin(), collection.end());


    // Output selection and sorting (if needed)
    if (param == 'f'){
        // stable_sort keeps words with the same count in alphabetical order,
        // which makes the output deterministic.
        std::stable_sort(wordlist.begin(), wordlist.end(),
            [](const auto & a, const auto & b)
        {
            return a.second > b.second;
        });
        print_f(wordlist, max_length);
    }
    else if (param == 'a'){
        // Already in alphabetical order, see above.
        print_a(wordlist, max_length);
    }
    else {
        print_o(all, line_length);
    }


    return 0;
}