# projet-analyse-exploratoire/tp-analyse-explorative-master/tp-analyse-explorative/exo2.R

library(tidyverse)
library(directlabels)

####### 1 Plot and Given names #######
### Exploring the Pink City
# Read the table prenoms.csv (semicolon-separated). The analysis below uses
# its columns Année, Sexe, Prénom and Nombre.
prenoms <- read.csv('/home/labourde/Bureau/tp-analyse-explorative/prenoms.csv',sep=";")
# Inspect it
head(prenoms)

# Plot the number of births by year
prenoms %>%
  group_by(Année) %>%
  summarise(Naissances = sum(Nombre)) %>%
  ggplot(aes(x = Année, y = Naissances)) +
  geom_point()

# Plot the number of male/female births by year
prenoms %>%
  group_by(Année, Sexe) %>%
  summarise(Naissances = sum(Nombre), .groups = "drop") %>%
  ggplot(aes(x = Année, y = Naissances)) +
  geom_point(aes(color = Sexe))

# Is my name in the dataset ? (case-insensitive lookup)
prenoms[tolower(prenoms$Prénom) == tolower("Titouan"), ]

# Represent the 10 most given names (all years and both sexes summed)
mostGivenNames <- prenoms %>%
  group_by(Prénom) %>%
  summarise(Naissances = sum(Nombre)) %>%
  arrange(desc(Naissances))
ggplot(head(mostGivenNames, 10), aes(x = Prénom, y = Naissances)) +
  geom_col(aes(fill = Prénom))

# Select for each year the top 5 given names by sex and their evolution
# through the years
mostMaleGivenNamesByYear <- prenoms %>%
  filter(Sexe == "M") %>%
  group_by(Année) %>%
  slice_max(order_by = Nombre, n = 5)
mostFemaleGivenNamesByYear <- prenoms %>%
  filter(Sexe == "F") %>%
  group_by(Année) %>%
  slice_max(order_by = Nombre, n = 5)
mostGivenNamesBySexeAndYear <- bind_rows(
  mostMaleGivenNamesByYear,
  mostFemaleGivenNamesByYear
)
ggplot(mostGivenNamesBySexeAndYear, aes(x = Année, y = Nombre)) +
  geom_point(aes(color = Prénom)) +
  facet_wrap("Prénom")

# Plot the average numbers of letters by years
# (unweighted mean over the rows of each year)
averageNumberOfLettersByYear <- prenoms %>%
  group_by(Année) %>%
  summarise(Lettres = mean(nchar(Prénom)))
ggplot(averageNumberOfLettersByYear, aes(x = Année, y = Lettres)) +
  geom_line()

# Plot the average numbers of vowels/consonants by years
# The letters of each class are counted directly; deleting one class and
# measuring what is left would count the other class plus hyphens.
vowelPattern <- "[aeiouyàâäéèêëîïôöùûü]"
consonantPattern <- "[bcdfghjklmnpqrstvwxzç]"
averageNumberOfVowelsAndConsonantsByYear <- prenoms %>%
  group_by(Année) %>%
  summarise(
    Vowels = mean(str_count(str_to_lower(Prénom), vowelPattern)),
    Consonants = mean(str_count(str_to_lower(Prénom), consonantPattern))
  )
ggplot(averageNumberOfVowelsAndConsonantsByYear, aes(x = Année, y = Vowels)) +
  geom_point()
ggplot(
  averageNumberOfVowelsAndConsonantsByYear,
  aes(x = Année, y = Consonants)
) +
  geom_point()

# How the number of composed names change through the years
# A composed name is any name holding a hyphen between two letters
# (\p{L} matches any Unicode letter, accented or not, upper or lower case).
composedNamesByYear <- prenoms %>%
  filter(str_detect(Prénom, "\\p{L}-\\p{L}")) %>%
  group_by(Année) %>%
  summarise(nb_composedNames = sum(Nombre))
ggplot(composedNamesByYear, aes(x = Année, y = nb_composedNames)) +
  geom_line()

# Defining a "hype" criteria and finding the hypest names
# Hype = increase in births of a name compared with the previous year in
# which the same name (and sex) is recorded. The lag is taken inside each
# (Prénom, Sexe) group and ordered by year, so a row is never compared with
# an unrelated neighbouring row of the file.
namesByYear <- prenoms %>%
  group_by(Prénom, Sexe) %>%
  mutate(
    nbAnnéePrec = lag(Nombre, order_by = Année),
    difference = Nombre - nbAnnéePrec
  ) %>%
  group_by(Année)

# Rows without an earlier record have no defined hype and are left out
hypestNameByYear <- namesByYear %>%
  filter(!is.na(difference)) %>%
  slice_max(order_by = difference, n = 1)
ggplot(hypestNameByYear, aes(x = factor(Année), y = difference)) +
  geom_text(aes(label = Prénom))

### Exploring the Gray City
# Read the table prenomsParis.csv and give its year and name columns the same
# names (Année, Prénom) as the ones used for the first table
prenomsParis <- rename(
  read.csv('/home/labourde/Bureau/tp-analyse-explorative/prenomsParis.csv', sep = ";"),
  Année = Annee,
  Prénom = Prenoms
)

#' Build the exploratory plots for one table of given names.
#'
#' @param prenoms Data frame with the columns Année, Sexe, Prénom and Nombre.
#' @return A list of nine ggplot objects, in this order: births per year,
#'   births per year and sex, ten most given names, yearly top 5 names per
#'   sex, mean name length per year, mean number of vowels per year, mean
#'   number of consonants per year, births with a composed name per year,
#'   hypest name per year.
nameStudy <- function(prenoms) {
  # Letter classes used for the vowel / consonant counts (y counts as a vowel)
  vowelPattern <- "[aeiouyàâäéèêëîïôöùûü]"
  consonantPattern <- "[bcdfghjklmnpqrstvwxzç]"

  # Plot the number of births by year
  p1 <- prenoms %>%
    group_by(Année) %>%
    summarise(Naissances = sum(Nombre)) %>%
    ggplot(aes(x = Année, y = Naissances)) +
    ggtitle("Naissances par année") +
    geom_point()

  # Plot the number of male/female births by year
  p2 <- prenoms %>%
    group_by(Année, Sexe) %>%
    summarise(Naissances = sum(Nombre), .groups = "drop") %>%
    ggplot(aes(x = Année, y = Naissances)) +
    ggtitle("Sexe des naissances par année") +
    geom_point(aes(color = Sexe))

  # Represent the 10 most given names, bars ordered by number of births
  mostGivenNames <- prenoms %>%
    group_by(Prénom) %>%
    summarise(Naissances = sum(Nombre)) %>%
    arrange(desc(Naissances))
  p3 <- ggplot(
    head(mostGivenNames, 10),
    aes(x = reorder(Prénom, Naissances), y = Naissances)
  ) +
    ggtitle("Les 10 noms les plus donnés") +
    geom_col(aes(fill = reorder(Prénom, Naissances)))

  # Select for each year the top 5 given names by sex and their evolution
  # through the years
  mostMaleGivenNamesByYear <- prenoms %>%
    filter(Sexe == "M") %>%
    group_by(Année) %>%
    slice_max(order_by = Nombre, n = 5)
  mostFemaleGivenNamesByYear <- prenoms %>%
    filter(Sexe == "F") %>%
    group_by(Année) %>%
    slice_max(order_by = Nombre, n = 5)
  mostGivenNamesBySexeAndYear <- bind_rows(
    mostMaleGivenNamesByYear,
    mostFemaleGivenNamesByYear
  )
  p4 <- ggplot(mostGivenNamesBySexeAndYear, aes(x = Année, y = Nombre)) +
    ggtitle("Top 5 des noms donnés, par sexe et par an") +
    geom_point(aes(color = Prénom)) +
    facet_wrap("Prénom")

  # Plot the average numbers of letters by years
  # (unweighted mean over the rows of each year)
  averageNumberOfLettersByYear <- prenoms %>%
    group_by(Année) %>%
    summarise(Lettres = mean(nchar(Prénom)))
  p5 <- ggplot(averageNumberOfLettersByYear, aes(x = Année, y = Lettres)) +
    ggtitle("Nombre moyen de lettres dans les prénoms par année") +
    geom_line()

  # Plot the average numbers of vowels/consonants by years
  # The letters of each class are counted directly; deleting one class and
  # measuring what is left would count the other class plus hyphens.
  averageNumberOfVowelsAndConsonantsByYear <- prenoms %>%
    group_by(Année) %>%
    summarise(
      Vowels = mean(str_count(str_to_lower(Prénom), vowelPattern)),
      Consonants = mean(str_count(str_to_lower(Prénom), consonantPattern))
    )
  p6 <- ggplot(
    averageNumberOfVowelsAndConsonantsByYear,
    aes(x = Année, y = Vowels)
  ) +
    ggtitle("Nombre moyen de voyelles dans les prénoms par année") +
    geom_point()
  p7 <- ggplot(
    averageNumberOfVowelsAndConsonantsByYear,
    aes(x = Année, y = Consonants)
  ) +
    ggtitle("Nombre moyen de consonnes dans les prénoms par année") +
    geom_point()

  # How the number of composed names change through the years
  # A composed name is any name holding a hyphen between two letters
  # (\p{L} matches any Unicode letter, accented or not, upper or lower case).
  composedNamesByYear <- prenoms %>%
    filter(str_detect(Prénom, "\\p{L}-\\p{L}")) %>%
    group_by(Année) %>%
    summarise(nb_composedNames = sum(Nombre))
  p8 <- ggplot(composedNamesByYear, aes(x = Année, y = nb_composedNames)) +
    ggtitle("Nombre de prénoms composés par année") +
    geom_line()

  # Defining a "hype" criteria and finding the hypest names
  # Hype = increase in births of a name compared with the previous year in
  # which the same name (and sex) is recorded. The lag is taken inside each
  # (Prénom, Sexe) group and ordered by year; rows without an earlier record
  # have no defined hype and are left out.
  hypestNameByYear <- prenoms %>%
    group_by(Prénom, Sexe) %>%
    mutate(
      nbAnnéePrec = lag(Nombre, order_by = Année),
      difference = Nombre - nbAnnéePrec
    ) %>%
    ungroup() %>%
    filter(!is.na(difference)) %>%
    group_by(Année) %>%
    slice_max(order_by = difference, n = 1) %>%
    ungroup()
  p9 <- ggplot(hypestNameByYear, aes(x = factor(Année), y = difference)) +
    ggtitle("Prénom le plus hype par année") +
    geom_text(aes(label = Prénom))

  list(p1, p2, p3, p4, p5, p6, p7, p8, p9)
}
# Run the single-table study on the Paris table and display its plots
plotsParis <- nameStudy(prenomsParis)
plotsParis

# A tale of two (or more) cities

# Drop the Ordre column from the Toulouse table, tag both tables with their
# city and stack them
prenoms <- subset(prenoms, select = -Ordre)
prenoms[["Ville"]] <- "Toulouse"
prenomsParis[["Ville"]] <- "Paris"
# Attach to every row the total number of births of its (Année, Ville) pair;
# the result stays grouped by Année and Ville
allPrenoms <- rbind(prenoms, prenomsParis) %>%
  group_by(Année, Ville) %>%
  mutate(NaissancesVilleAnnée = sum(Nombre))

#' Build the exploratory plots for a table of given names covering several
#' cities, with counts expressed relative to each city's births where the
#' cities are compared.
#'
#' @param prenoms Data frame with the columns Année, Sexe, Prénom, Nombre and
#'   Ville. A NaissancesVilleAnnée column is (re)computed inside the function.
#' @return A list of nine elements, in this order: births per year and city,
#'   share of each sex per year and city, ten most given names per city,
#'   yearly top 5 names per sex and city, mean name length, mean number of
#'   vowels, mean number of consonants, share of composed names, hypest name
#'   per year and city. Elements are ggplot objects; the fourth one is the
#'   caught error condition if building that plot failed.
nameStudyCombined <- function(prenoms) {
  # Letter classes used for the vowel / consonant counts (y counts as a vowel)
  vowelPattern <- "[aeiouyàâäéèêëîïôöùûü]"
  consonantPattern <- "[bcdfghjklmnpqrstvwxzç]"

  # Plot the number of births by year
  p1 <- prenoms %>%
    group_by(Année, Ville) %>%
    summarise(Naissances = sum(Nombre), .groups = "drop") %>%
    ggplot(aes(x = Année, y = Naissances)) +
    geom_point(aes(color = Ville)) +
    geom_line(aes(color = Ville)) +
    ggtitle("Naissances par année")

  # Total births of each (year, city), repeated on every row of that pair and
  # used below as the denominator of the percentages
  prenoms <- prenoms %>%
    group_by(Année, Ville) %>%
    mutate(NaissancesVilleAnnée = sum(Nombre)) %>%
    ungroup()

  # Plot the number of male/female births by year and by cities
  # The denominator is constant inside a group, so its first value is used to
  # get exactly one row per (year, sex, city).
  p2 <- prenoms %>%
    group_by(Année, Sexe, Ville) %>%
    summarise(
      Naissances = 100 * sum(Nombre) / first(NaissancesVilleAnnée),
      .groups = "drop"
    ) %>%
    ggplot(aes(x = Année, y = Naissances)) +
    geom_point(aes(color = Sexe)) +
    facet_wrap("Ville") +
    ggtitle("Sexe des naissances par année en %")

  # Represent the 10 most given names by cities
  # Share of a name = its births in the city / all births of the city. The
  # city total is taken once; summing it over the rows of the name would
  # divide the share by the number of rows the name has.
  mostGivenNames <- prenoms %>%
    group_by(Ville) %>%
    mutate(NaissancesVille = sum(Nombre)) %>%
    group_by(Prénom, Ville) %>%
    summarise(
      Naissances = 100 * sum(Nombre) / first(NaissancesVille),
      .groups = "drop"
    ) %>%
    group_by(Ville) %>%
    slice_max(order_by = Naissances, n = 10) %>%
    ungroup()
  p3 <- ggplot(
    mostGivenNames,
    aes(x = reorder(Prénom, Naissances), y = Naissances)
  ) +
    ggtitle("Les 10 noms les plus donnés par ville") +
    geom_col(aes(fill = reorder(Prénom, Naissances))) +
    facet_wrap("Ville")

  # Select for each year the top 5 given names by sex and their evolution
  # through the years by cities. A failure while building this plot is
  # reported as a warning and the condition is returned in place of the plot.
  p4 <- tryCatch(
    {
      mostGivenNamesBySexeAndYearNormalized <- prenoms %>%
        filter(Sexe %in% c("M", "F")) %>%
        group_by(Sexe, Ville, Année) %>%
        slice_max(order_by = Nombre, n = 5) %>%
        ungroup() %>%
        mutate(Naissances = 100 * Nombre / NaissancesVilleAnnée)
      ggplot(
        mostGivenNamesBySexeAndYearNormalized,
        aes(x = Année, y = Naissances)
      ) +
        ggtitle("Top 5 des noms donnés, par sexe, par an et par ville") +
        geom_line(aes(color = Prénom)) +
        geom_point(aes(color = Prénom, shape = Sexe), size = 3) +
        geom_dl(
          aes(label = Prénom),
          method = list(dl.combine("first.points")),
          cex = 0.8
        ) +
        facet_wrap("Ville")
    },
    error = function(e) {
      warning(
        "Top 5 plot could not be built: ", conditionMessage(e),
        call. = FALSE
      )
      e
    }
  )

  # Plot the average numbers of letters by years by cities
  # (unweighted mean over the rows of each year and city)
  averageNumberOfLettersByYear <- prenoms %>%
    group_by(Année, Ville) %>%
    summarise(Lettres = mean(nchar(Prénom)), .groups = "drop")
  p5 <- ggplot(averageNumberOfLettersByYear, aes(x = Année, y = Lettres)) +
    ggtitle("Nombre moyen de lettres dans les prénoms par année et par ville") +
    geom_point(aes(color = Ville)) +
    geom_line(aes(color = Ville))

  # Plot the average numbers of vowels/consonants by years and by cities
  # The letters of each class are counted directly; deleting one class and
  # measuring what is left would count the other class plus hyphens.
  averageNumberOfVowelsAndConsonantsByYear <- prenoms %>%
    group_by(Ville, Année) %>%
    summarise(
      Vowels = mean(str_count(str_to_lower(Prénom), vowelPattern)),
      Consonants = mean(str_count(str_to_lower(Prénom), consonantPattern)),
      .groups = "drop"
    )
  p6 <- ggplot(
    averageNumberOfVowelsAndConsonantsByYear,
    aes(x = Année, y = Vowels)
  ) +
    ggtitle("Nombre moyen de voyelles dans les prénoms par année et par ville") +
    geom_point(aes(color = Ville)) +
    geom_line(aes(color = Ville))
  p7 <- ggplot(
    averageNumberOfVowelsAndConsonantsByYear,
    aes(x = Année, y = Consonants)
  ) +
    ggtitle("Nombre moyen de consonnes dans les prénoms par année et par ville") +
    geom_point(aes(color = Ville)) +
    geom_line(aes(color = Ville))

  # How the number of composed names change through the years
  # A composed name is any name holding a hyphen between two letters
  # (\p{L} matches any Unicode letter). The value plotted is the percentage
  # of the births of the (year, city) pair.
  composedNamesByYear <- prenoms %>%
    filter(str_detect(Prénom, "\\p{L}-\\p{L}")) %>%
    group_by(Année, Ville) %>%
    summarise(
      nb_composedNames = 100 * sum(Nombre) / first(NaissancesVilleAnnée),
      .groups = "drop"
    )
  p8 <- ggplot(composedNamesByYear, aes(x = Année, y = nb_composedNames)) +
    ggtitle("Nombre de prénoms composés par année et par ville") +
    geom_point(aes(color = Ville)) +
    geom_line(aes(color = Ville))

  # Defining a "hype" criteria and finding the hypest names
  # Hype = increase in births of a name compared with the previous year in
  # which the same name (and sex) is recorded in the same city. The lag is
  # taken inside each (Prénom, Sexe, Ville) group and ordered by year; rows
  # without an earlier record have no defined hype and are left out.
  hypestNameByYearAndCities <- prenoms %>%
    group_by(Prénom, Sexe, Ville) %>%
    mutate(
      nbAnnéePrec = lag(Nombre, order_by = Année),
      difference = Nombre - nbAnnéePrec
    ) %>%
    ungroup() %>%
    filter(!is.na(difference)) %>%
    group_by(Année, Ville) %>%
    slice_max(order_by = difference, n = 1) %>%
    ungroup()
  p9 <- ggplot(
    hypestNameByYearAndCities,
    aes(x = factor(Année), y = difference / NaissancesVilleAnnée)
  ) +
    ggtitle("Prénom le plus hype par année et par ville") +
    geom_text(aes(label = Prénom, color = Prénom)) +
    facet_wrap("Ville")

  list(p1, p2, p3, p4, p5, p6, p7, p8, p9)
}

# Run the multi-city study on the stacked table and display its plots
combinedParisToulouse <- nameStudyCombined(allPrenoms)
combinedParisToulouse

# The most unshared names
# Proportion: share of a given name in all the births of its city, repeated
# on every row of that (Prénom, Ville) pair.
namesByCities <- allPrenoms %>%
  group_by(Ville) %>%
  mutate(NaissancesVille = sum(Nombre)) %>%
  group_by(Prénom, Ville) %>%
  mutate(Proportion = sum(Nombre) / NaissancesVille)

# autreVilleProportion: mean share of the same name in the other cities.
# It is computed on a one-row-per-(Prénom, Ville) table, where a name absent
# from a city gets a share of 0, rather than from neighbouring rows of the
# stacked table, which are unrelated to the other city.
otherCityShares <- namesByCities %>%
  summarise(Proportion = first(Proportion), .groups = "drop") %>%
  complete(Prénom, Ville, fill = list(Proportion = 0)) %>%
  group_by(Prénom) %>%
  mutate(autreVilleProportion = (sum(Proportion) - Proportion) / (n() - 1)) %>%
  ungroup() %>%
  select(Prénom, Ville, autreVilleProportion)

# difference > 0: the name is more frequent in this city than elsewhere.
# The table keeps one row per input row and stays grouped by Prénom, Ville.
namesByCities <- namesByCities %>%
  left_join(otherCityShares, by = c("Prénom", "Ville")) %>%
  mutate(difference = Proportion - autreVilleProportion)


### A Tale of many cities
# Read the table prenomsRennesStrassNantesToul.csv and rename its columns to
# the names expected by nameStudyCombined
prenomsRennesStrassNantesToul <- read.csv(
  '/home/labourde/Bureau/tp-analyse-explorative/prenomsRennesStrassNantesToul.csv',
  sep = ";"
) %>%
  rename(
    Année = ANNAISS,
    Ville = LBCOM,
    Sexe = SEX,
    Prénom = PRN,
    Nombre = NRB
  )
# Rows whose Ville is "RENNES", kept in a scratch variable
a <- prenomsRennesStrassNantesToul[prenomsRennesStrassNantesToul$Ville == "RENNES", ]
# Run the multi-city study, then display its nine results one at a time
combinedRennesStrassNantesToul <- nameStudyCombined(prenomsRennesStrassNantesToul)
combinedRennesStrassNantesToul[1]
combinedRennesStrassNantesToul[2]
combinedRennesStrassNantesToul[3]
combinedRennesStrassNantesToul[4]
combinedRennesStrassNantesToul[5]
combinedRennesStrassNantesToul[6]
combinedRennesStrassNantesToul[7]
combinedRennesStrassNantesToul[8]
combinedRennesStrassNantesToul[9]