Questo commit è contenuto in:
m-gues 2021-12-08 18:31:26 +01:00
commit 2ed6382fd8
32 ha cambiato i file con 1659524 aggiunte e 0 eliminazioni

BIN
.RData File normale

File binario non mostrato.

136
.Rhistory File normale
Vedi file

@ -0,0 +1,136 @@
# --- Verbatim .Rhistory log of an interactive exploration session. ---
# Failed attempts and typos are part of the record and are kept as typed;
# each retry below refines the previous command.
library("jsonlite")
# First call passes the path as `file=` (retried on the next line without it).
data<-fromJSON(file="C:\\Users\\Marianne\\Desktop\\projet-analyse-exploratoire\\anime-offline-database-master\\anime-offline-database.json")
data<-fromJSON("C:\\Users\\Marianne\\Desktop\\projet-analyse-exploratoire\\anime-offline-database-master\\anime-offline-database.json")
dfAnimes <- as.data.frame(data)
library("tidyverse")
install.packages("tidyverse")
library("tidyverse")
dfAnimes
# Drop the columns that are not used
dfAnimes <- select(dfAnimes, data.title, data.type, data.episodes, data.status, data.animeSeason, data.tags)
# Number of anime running more than 2 cours (24 episodes) per year
# (typo: `%>% longbois` was meant to be `-> longbois`; retried just below)
dfAnimes %>%
  filter(data.episodes>25) %>% longbois
# Number of anime running more than 2 cours (24 episodes) per year
dfAnimes %>%
  filter(data.episodes>25) -> longbois
# Number of anime running more than 2 cours (24 episodes) per year
dfAnimes %>%
  filter(data.episodes>30) -> longbois
# Number of anime running more than 2 cours (estimated at 30 episodes) per year
dfAnimes %>%
  filter(data.episodes>30) %>%
  group_by(data.animeSeason) %>%
  count()-> longbois
# Number of anime running more than 2 cours (estimated at 30 episodes) per year
dfAnimes %>%
  filter(data.episodes>30) %>%
  group_by(data.animeSeason) %>%
  count(nbAnimes)-> longbois
# Number of anime running more than 2 cours (estimated at 30 episodes) per year
# (rename() arguments reversed here -- new = old -- corrected in the retry)
dfAnimes %>%
  filter(data.episodes>30) %>%
  group_by(data.animeSeason) %>%
  count() %>%
  rename(n = nbAnimes) -> longbois
# Number of anime running more than 2 cours (estimated at 30 episodes) per year
dfAnimes %>%
  filter(data.episodes>30) %>%
  group_by(data.animeSeason) %>%
  count() %>%
  rename(nbAnimes = n) -> longbois
# Successive guesses at the ggplot2 bar geom; geom_col() is the one that exists
longbois %>% ggplot(aes(data, nbAnimes)) + ggcol()
longbois %>% ggplot(aes(data, nbAnimes)) + ggcols()
longbois %>% ggplot(aes(data, nbAnimes)) + geom_cols()
longbois %>% ggplot(aes(data, nbAnimes)) + geom_col()
longbois %>% ggplot(aes(data.animeSeason, nbAnimes)) + geom_col()
longbois %>% ggplot(aes(data.animeSeason$year, nbAnimes)) + geom_col()
# Evolution of the number of letters (mean and median) in titles, per year
dfAnimes %>%
  group_by(data.animeSeason$year) %>%
  summarise(Lettres=mean(nchar(data.title))) -> meanLettersByYear
View(meanLettersByYear)
# Evolution of the number of letters (mean and median) in titles, per year
dfAnimes %>%
  group_by(data.animeSeason$year) %>%
  summarise(lettres=mean(nchar(data.title))) -> meanLettersByYear
dfAnimes %>%
  group_by(data.animeSeason$year) %>%
  summarise(lettres=median(nchar(data.title))) -> medLettersByYear
meanLettersByYear %>% ggplot(aes(data.animeSeason$year, lettres)) + geom_col()
dfAnimes %>%
  group_by(data.animeSeason$year) %>%
  mutate(lettres=median(nchar(data.title))) -> medLettersByYear
dfAnimes %>%
  group_by(data.animeSeason$year) %>%
  summarise(lettres=median(nchar(data.title))) -> medLettersByYear
meanLettersByYear %>% ggplot(aes(data.animeSeason$year, lettres)) + geom_col()
dfAnimes %>%
  group_by(data.animeSeason$year) %>%
  summarise(lettres=median(nchar(data.title))) %>%
  rename(annee = data.animeSeason$year)-> medLettersByYear
# Split the nested season column into plain year/season columns
dfAnimes %>%
  mutate(annee = data.animeSeason$year) %>%
  mutate(saison = data.animeSeason$season) -> dfAnimes
dfAnimes %>%
  group_by(annee) %>%
  summarise(lettres=median(nchar(data.title))) -> medLettersByYear
View(medLettersByYear)
View(dfAnimes)
View(dfAnimes)
# Evolution of the number of letters (mean and median) in titles, per year
dfAnimes %>%
  group_by(annee) %>%
  summarise(lettres=mean(nchar(data.title))) -> meanLettersByYear
View(meanLettersByYear)
meanLettersByYear %>% ggplot(aes(data.animeSeason$year, lettres)) + geom_col()
meanLettersByYear %>% ggplot(aes(annee, lettres)) + geom_col()
dfAnimes %>%
  group_by(annee) %>%
  summarise(lettres=median(nchar(data.title))) -> medLettersByYear
medLettersByYear %>% ggplot(aes(annee, lettres)) + geom_col()
filter(data, data.animeSeason$year > 1924) -> post1924
filter(dfAnimes, dfAnimes.animeSeason$year > 1924) -> post1924
dfAnimes
filter(dfAnimes, dfAnimes$annee > 1924) -> post1924
filter(post1924,post1924$annee < 2022) -> animeCentury
animeCentury
View(dfAnimes)
View(dfAnimes)
# Keep anime released between 1925 and 2020, drop OVAs, movies, etc.
# (the first two attempts have the comparison directions inverted)
dfAnimes %>%
  filter(annee < 1924) %>%
  filter(annee > 2021) %>%
  filter(data.type == "TV") -> animeCentury
# Keep anime released between 1925 and 2020, drop OVAs, movies, etc.
dfAnimes %>%
  filter(annee < 1924) %>%
  filter(annee > 2021) -> animeCentury
# Keep anime released between 1925 and 2020, drop OVAs, movies, etc.
dfAnimes %>%
  filter(annee > 1924) %>%
  filter(annee < 2021) %>%
  filter(data.type == "TV") -> animeCentury
View(animeCentury)
# Number of anime longer than 2 cours (estimated at 30 episodes) per year => to be improved
animeCentury %>%
  filter(data.episodes>30) %>%
  group_by(annee) %>%
  count() %>%
  rename(nbAnimes = n) -> longbois
longbois %>% ggplot(aes(annee, nbAnimes)) + geom_col()
# Tighten the window: keep TV anime with 1960 < annee < 2021 only
dfAnimes %>%
  filter(annee > 1960) %>%
  filter(annee < 2021) %>%
  filter(data.type == "TV") -> animeCentury
# Evolution of the number of letters (mean and median) in titles, per year
animeCentury %>%
  group_by(annee) %>%
  summarise(lettres=mean(nchar(data.title))) -> meanLettersByYear
animeCentury %>%
  group_by(annee) %>%
  summarise(lettres=median(nchar(data.title))) -> medLettersByYear
meanLettersByYear %>% ggplot(aes(annee, lettres)) + geom_col()
medLettersByYear %>% ggplot(aes(annee, lettres)) + geom_col()

Vedi file

@ -0,0 +1,66 @@
# Contribution guidelines
Please read the FAQ down below.
## Possible errors / problems in the database
If you find something that, in your opinion, could be the result of incorrectly extracted data, please submit an issue rather than creating a pull request, because the database is created by an automated process.
## Adding your project to the list of projects using this database
In case you have a project that uses this database and you want to add it to the list of projects that are using this database, create a pull request adding it to the table. Do not create an issue asking me or anyone else to add it.
+ You have to be the author/maintainer of the project that you want to add
+ Create a PR in which you add it to the table in the README.md
+ Do not change/alter anything else
+ Your project has to use this database
+ You have to have a link back to this project in the README.md of your project
+ The README.md of your project has to be in english or it must have an english translation
+ Your project has to be hosted either on github or gitlab
+ The table is sorted by project name (ascending). Add your entry accordingly.
+ Project name must match the repository name and link directly to the source code (not a project page such as YOURNAME.github.io)
+ Put your name under _Author/Maintainer_ with a link to your profile.
+ Add a meaningful description in english. The description must not be longer than 150 characters.
# FAQ
## What do you mean by 'meta data provider'?
Websites which provide information about anime such as `myanimelist.net`, `notify.moe`, ...
## Can you please add additional data/properties?
No. The dataset has been created for my own tool. It contains all data/properties that I need and I won't add more data/properties. This is merely an index. The idea is to visit the meta data provider of your choice to get additional information about the anime.
## Can you please add an additional meta data provider?
No. I don't plan to add any additional meta data provider.
## Can you please change the structure of the file?
No. The file has the structure that it needs to have for the purpose it has been built for.
## There are duplicates in the dataset.
If the entry of one meta data provider is not merged with an entry of a different meta data provider, although they are practically the same entry, then this is **not a duplicate**.
They are simply not merged together. This can happen and it is intentional. Since this dataset is created automatically two entries should rather not be merged than falsely merged together.
If you query this dataset based on titles/synonyms it might seem that there are duplicates. However the intended usage is to query by the url of the meta data provider. This way you will always retrieve the entry that you want. Entries being merged together is just a nice to have.
A duplicate by definition of this dataset is an entry which contains multiple links of the same meta data provider in `sources`.
## Why are there no IDs?
There are. The entries under `sources` are the IDs. Each one of the array's URLs is a key for that specific entry.
## Is this dataset created automatically or manually?
It is created automatically and reviewed in a half-automated process.
## Do you plan to open source the code which creates this dataset?
Yes. Parts of the code are already [available](https://github.com/manami-project?tab=repositories&q=modb&type=source). However there is still work to do before I can/want to open source the rest and that doesn't have any priority right now.
## How do you split entries?
Entries are split if one meta data provider lists multiple entries as one and others don't.
**Example:**
* The entry of a meta data provider which lists 3 Movies as one entry is split from three separate entries of another meta data provider
* A series is listed as one entry having 26 episodes on one meta data provider and as two entries having 13 episodes each on the other meta data provider
However if one entry is listed with 13 episodes whereas the other is listed with 12, because it doesn't count the recap episode then these entries are still merged together.
## Can I somehow contribute?
Currently I can't think of a way. But you can check the [predefined issue templates](https://github.com/manami-project/anime-offline-database/issues/new/choose) in case you want to report to one of the available cases.
## Does this dataset contain all anime from the supported meta data provider?
No. MAL and anisearch are the only providers which list adult titles publicly. So this type of anime is missing for the other meta data providers.
If there are new entries which have been created after an update then those obviously won't appear until the next update.
Apart from that it should contain all titles from the supported meta data provider.

Vedi file

@ -0,0 +1,8 @@
blank_issues_enabled: false
contact_links:
- name: Guide to add your project to the project list.
url: https://github.com/manami-project/anime-offline-database/blob/master/.github/CONTRIBUTING.md#adding-your-project-to-the-list-of-projects-using-this-database
about: How to add your project to the list of projects using this database.
- name: FAQ
url: https://github.com/manami-project/anime-offline-database/blob/master/.github/CONTRIBUTING.md#faq
about: Frequently Asked Questions

Vedi file

@ -0,0 +1,51 @@
---
name: Falsely merged entry
about: Entries have been merged together although they should be separate entries?
title: ''
labels: ''
assignees: manami-project
---
Please read the [FAQ](https://github.com/manami-project/anime-offline-database/blob/master/.github/CONTRIBUTING.md#faq) first.
Especially the sections on [duplicates](https://github.com/manami-project/anime-offline-database/blob/master/.github/CONTRIBUTING.md#there-are-duplicates-in-the-data-set) and [splits](https://github.com/manami-project/anime-offline-database/blob/master/.github/CONTRIBUTING.md#how-do-you-split-entries). Please refrain from creating issues stating that entries should be merged together. This is only for _splitting_ entries which have already been merged together, but should be separated.
**Only one entry per issue**
## Which entry should be split? (original from data set)
**Example:**
```
"https://anidb.net/anime/9466",
"https://anilist.co/anime/15809",
"https://anime-planet.com/anime/the-devil-is-a-part-timer",
"https://kitsu.io/anime/7314",
"https://myanimelist.net/anime/15809",
"https://notify.moe/anime/CGnFpKimR"
"https://anidb.net/anime/16104",
"https://anilist.co/anime/130592",
"https://anime-planet.com/anime/the-devil-is-a-part-timer-2",
"https://kitsu.io/anime/44113",
"https://myanimelist.net/anime/48413",
"https://notify.moe/anime/Zy3-TV8MR"
```
## How should it be split?
**Example:**
```
"https://anidb.net/anime/9466",
"https://anilist.co/anime/15809",
"https://anime-planet.com/anime/the-devil-is-a-part-timer",
"https://kitsu.io/anime/7314",
"https://myanimelist.net/anime/15809",
"https://notify.moe/anime/CGnFpKimR"
```
```
"https://anidb.net/anime/16104",
"https://anilist.co/anime/130592",
"https://anime-planet.com/anime/the-devil-is-a-part-timer-2",
"https://kitsu.io/anime/44113",
"https://myanimelist.net/anime/48413",
"https://notify.moe/anime/Zy3-TV8MR"
```

Vedi file

@ -0,0 +1,19 @@
---
name: Problem in data extraction
about: Is there a problem in the data extraction?
title: ''
labels: ''
assignees: manami-project
---
Please read the [FAQ](https://github.com/manami-project/anime-offline-database/blob/master/.github/CONTRIBUTING.md#faq) first.
* Which data is not extracted correctly? (e.g. title, episodes...)
* Can you provide an example entry?
* Which value is expected?

Vedi file

@ -0,0 +1,10 @@
---
name: Question
about: You have a question which was not covered by the FAQ?
title: ''
labels: question
assignees: manami-project
---
Please read the [FAQ](https://github.com/manami-project/anime-offline-database/blob/master/.github/CONTRIBUTING.md#faq) first.

Vedi file

@ -0,0 +1,37 @@
# GitHub Actions workflow: lint every JSON file of the repository on each push.
# (Indentation restored -- the scraped page had flattened it.)
name: Check JSON files
on:
  push:
    branches:
      - '**'
    # Documentation-only changes do not need a JSON lint run.
    paths-ignore:
      - 'README.md'
      - '.gitignore'
      - '.gitattributes'
      - '.github/**/*'
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Setup node environment
        uses: actions/setup-node@v1
        with:
          node-version: '14'
      - name: Install jsonlint
        run: npm install jsonlint -g
      # `jsonlint -q` is quiet on success; a parse failure fails the job.
      - name: Check anime-offline-database.json
        run: jsonlint -q anime-offline-database.json
      - name: Check anime-offline-database-minified.json
        run: jsonlint -q anime-offline-database-minified.json
      - name: Check dead-entries for anidb
        run: jsonlint -q dead-entries/anidb.json
      - name: Check dead-entries for anilist
        run: jsonlint -q dead-entries/anilist.json
      - name: Check dead-entries for kitsu
        run: jsonlint -q dead-entries/kitsu.json
      - name: Check dead-entries for livechart
        run: jsonlint -q dead-entries/livechart.json
      - name: Check dead-entries for myanimelist
        run: jsonlint -q dead-entries/myanimelist.json

Vedi file

@ -0,0 +1,29 @@
# Whitelist approach: ignore everything at the repository root, then re-allow
# only the files that belong in the published dataset.
/*
!.gitignore
!README.md
!anime-offline-database.json
!anime-offline-database-minified.json
# .github: only the contributing guide, the lint workflow and issue templates.
!.github/
.github/*
!.github/CONTRIBUTING.md
!.github/workflows/
.github/workflows/*
!.github/workflows/json_lint.yml
!.github/ISSUE_TEMPLATE/
.github/ISSUE_TEMPLATE/*
!.github/ISSUE_TEMPLATE/problem-in-data-extraction.md
!.github/ISSUE_TEMPLATE/question.md
!.github/ISSUE_TEMPLATE/falsely-merged-entries.md
!.github/ISSUE_TEMPLATE/config.yml
# dead-entries: one JSON file per meta data provider.
!dead-entries/
dead-entries/*
!dead-entries/anidb.json
!dead-entries/anilist.json
!dead-entries/kitsu.json
!dead-entries/myanimelist.json
!dead-entries/livechart.json

Vedi file

@ -0,0 +1,229 @@
![CI build status](https://github.com/manami-project/anime-offline-database/workflows/Check%20JSON%20files/badge.svg "CI build status: Check JSON files")
# anime-offline-database
The purpose of this repository is to create an offline database containing anime meta data aggregated by different anime meta data providers (such as myanimelist.net, anidb.net, kitsu.io and more) and allow cross references between those meta data providers. This file is supposed to be used by and created for [manami](https://github.com/manami-project/manami).
**The goal is to deliver at least weekly updates.**
## Statistics
Update **week 48 [2021]**
The database consists of **33043** entries composed of:
+ 23233 entries from myanimelist.net
+ 18215 entries from anime-planet.com
+ 17231 entries from kitsu.io
+ 16208 entries from anisearch.com
+ 15526 entries from anilist.co
+ 15175 entries from notify.moe
+ 12127 entries from anidb.net
+ 9562 entries from livechart.me
Missed updates:
+ **2021:** 0 _(so far)_
+ **2020:** 0
+ **2019:** 2
+ **2018:** 1
## Structure
This repository contains various JSON files. The database file itself as well as one file containing IDs of dead entries for each meta data provider to support the automated process.
### anime-offline-database-minified.json
Minified version of `anime-offline-database.json` which contains the same data, but is smaller in size.
### anime-offline-database.json
#### Data types
**Root**
| Field | Type | Nullable |
| --- | --- | --- |
| data | ```Anime[]``` | no |
**Anime**
| Field | Type | Nullable |
| --- | --- | --- |
| sources | ```URL[]``` | no |
| title | ```String``` | no |
| type | ```Enum of [TV, MOVIE, OVA, ONA, SPECIAL, UNKNOWN]``` | no |
| episodes | ```Integer``` | no |
| status | ```Enum of [FINISHED, ONGOING, UPCOMING, UNKNOWN]``` | no |
| animeSeason | ```AnimeSeason``` | no |
| picture | ```URL``` | no |
| thumbnail | ```URL``` | no |
| synonyms | ```String[]``` | no |
| relations | ```URL[]``` | no |
| tags | ```String[]``` | no |
**AnimeSeason**
| Field | Type | Nullable |
| --- | --- | --- |
| season | ```Enum of [SPRING, SUMMER, FALL, WINTER, UNDEFINED]``` | no |
| year | ```Integer``` | yes |
#### Example:
```json
{
"data": [
{
"sources": [
"https://anidb.net/anime/4563",
"https://anilist.co/anime/1535",
"https://anime-planet.com/anime/death-note",
"https://anisearch.com/anime/3633",
"https://kitsu.io/anime/1376",
"https://livechart.me/anime/3437",
"https://myanimelist.net/anime/1535",
"https://notify.moe/anime/0-A-5Fimg"
],
"title": "Death Note",
"type": "TV",
"episodes": 37,
"status": "FINISHED",
"animeSeason": {
"season": "FALL",
"year": 2006
},
"picture": "https://cdn.myanimelist.net/images/anime/9/9453.jpg",
"thumbnail": "https://cdn.myanimelist.net/images/anime/9/9453t.jpg",
"synonyms": [
"Bilježnica smrti",
"Caderno da Morte",
"Carnet de la Mort",
"DEATH NOTE",
"DN",
"Death Note - A halállista",
"Death Note - Carnetul morţii",
"Death Note - Zápisník smrti",
"Mirties Užrašai",
"Notatnik śmierci",
"Notes Śmierci",
"Quaderno della Morte",
"Sveska Smrti",
"Ölüm Defteri",
"Τετράδιο Θανάτου",
"Бележник на Смъртта",
"Записник Смерті",
"Свеска Смрти",
"Тетрадка на Смъртта",
"Тетрадь cмерти",
"Үхлийн Тэмдэглэл",
"מחברת המוות",
"دفترچه مرگ",
"دفترچه یادداشت مرگ",
"كـتـاب الـموت",
"مدونة الموت",
"مذكرة الموت",
"موت نوٹ",
"डेथ नोट",
"ですのーと",
"デスノート",
"死亡笔记",
"데스노트"
],
"relations": [
"https://anidb.net/anime/8146",
"https://anidb.net/anime/8147",
"https://anilist.co/anime/2994",
"https://anime-planet.com/anime/death-note-rewrite-1-visions-of-a-god",
"https://anime-planet.com/anime/death-note-rewrite-2-ls-successors",
"https://anisearch.com/anime/4441",
"https://anisearch.com/anime/5194",
"https://kitsu.io/anime/2707",
"https://livechart.me/anime/3808",
"https://myanimelist.net/anime/2994",
"https://notify.moe/anime/DBBU5Kimg"
],
"tags": [
"alternative present",
"amnesia",
"anti-hero",
"asexual",
"asia",
"based on a manga",
"contemporary fantasy",
"cops",
"crime",
"crime fiction",
"criminals",
"detective",
"detectives",
"drama",
"earth",
"espionage",
"fantasy",
"genius",
"gods",
"hero of strong character",
"horror",
"japan",
"kamis",
"kuudere",
"male protagonist",
"manga",
"mind games",
"mystery",
"overpowered main characters",
"philosophy",
"plot continuity",
"police",
"policeman",
"present",
"primarily adult cast",
"primarily male cast",
"psychological",
"psychological drama",
"psychopaths",
"revenge",
"rivalries",
"secret identity",
"serial killers",
"shinigami",
"shounen",
"supernatural",
"supernatural drama",
"thriller",
"time skip",
"tragedy",
"twisted story",
"university",
"urban",
"urban fantasy",
"vigilantes"
]
}
]
}
```
### dead-entries
Contains IDs which have been removed from the database of the corresponding meta data provider.
#### Data types
| Field | Type | Nullable |
| --- | --- | --- |
| deadEntries | ```String[]``` | no |
#### Example
```json
{
"deadEntries": [
"38492",
"38518",
"38522",
"38531"
]
}
```
## Other projects using this database
If you have a project that uses this database and you want to add it to this list, please read the [contribution guidelines](./.github/CONTRIBUTING.md) first.
|Project|Author/Maintainer|Short description|
|----|----|----|
|[adb-zeppelin-statistics](https://github.com/manami-project/adb-zeppelin-statistics)|[manami-project](https://github.com/manami-project)|A set of statistics and insights about anime on MAL.|
|[animanga-wordlist](https://github.com/ryuuganime/animanga-wordlist)|[ryuuganime](https://github.com/ryuuganime)|Japanese Anime, Manga, Characters, and Studio Word List/Dictionary|
|[arm-server](https://github.com/BeeeQueue/arm-server)|[BeeeQueue](https://github.com/BeeeQueue)|A REST API for querying this database.|
|[manami](https://github.com/manami-project/manami)|[manami-project](https://github.com/manami-project)|A tool to catalog anime on your hard drive and discover new anime to watch.|

File diff soppresso perché una o più righe sono troppo lunghe

File diff soppresso perché troppo grande Carica differenze

File diff soppresso perché troppo grande Carica differenze

File diff soppresso perché troppo grande Carica differenze

File diff soppresso perché troppo grande Carica differenze

File diff soppresso perché troppo grande Carica differenze

File diff soppresso perché troppo grande Carica differenze

52
main.R File normale
Vedi file

@ -0,0 +1,52 @@
# Explore the anime-offline-database: long-running shows per year and
# title-length trends (mean/median letters) for TV anime, 1961-2020.
library("jsonlite")
library("tidyverse")

# Parse the JSON database and flatten it into a data frame.
data <- fromJSON("C:\\Users\\Marianne\\Desktop\\projet-analyse-exploratoire\\anime-offline-database-master\\anime-offline-database.json")
dfAnimes <- as.data.frame(data)

# Keep only the columns used below.
dfAnimes <- select(dfAnimes, data.title, data.type, data.episodes,
                   data.status, data.animeSeason, data.tags)

# Split the nested season column into plain year/season columns.
dfAnimes <- dfAnimes %>%
  mutate(annee = data.animeSeason$year,
         saison = data.animeSeason$season)

# Keep TV series with 1960 < annee < 2021; drops OVAs, movies, etc.
animeCentury <- dfAnimes %>%
  filter(annee > 1960) %>%
  filter(annee < 2021) %>%
  filter(data.type == "TV")

# Shows longer than two cours (estimated at more than 30 episodes), per year.
# => to be improved (threshold is a rough estimate)
longbois <- animeCentury %>%
  filter(data.episodes > 30) %>%
  group_by(annee) %>%
  count() %>%
  rename(nbAnimes = n)
longbois %>% ggplot(aes(annee, nbAnimes)) + geom_col()

# Evolution of title length (mean and median number of letters) per year.
meanLettersByYear <- animeCentury %>%
  group_by(annee) %>%
  summarise(lettres = mean(nchar(data.title)))
medLettersByYear <- animeCentury %>%
  group_by(annee) %>%
  summarise(lettres = median(nchar(data.title)))
meanLettersByYear %>% ggplot(aes(annee, lettres)) + geom_col()
medLettersByYear %>% ggplot(aes(annee, lettres)) + geom_col()
#Evolution des 5 tags les plus représentés
animeCentury %>%
group_by(annee) %>%

File binario non mostrato.

Vedi file

@ -0,0 +1,35 @@
# ---> R
# Standard .gitignore for R projects: session artefacts, build output, caches.
# History files
.Rhistory
.Rapp.history
# Session Data files
.RData
# Example code in package build process
*-Ex.R
# Output files from R CMD build
/*.tar.gz
# Output files from R CMD check
/*.Rcheck/
# RStudio files
.Rproj.user/
# produced vignettes
vignettes/*.html
vignettes/*.pdf
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth
# knitr and R markdown default cache directories
/*_cache/
/cache/
# Temporary files created by R markdown
*.utf8.md
*.knit.md

Vedi file

@ -0,0 +1,2 @@
# tp-analyse-explorative

Vedi file

@ -0,0 +1,125 @@
### 1 Vectors ###
### 1.1 Let's start simple
e1 = c(2,5,0,8)
e2 = 1:200
e3 = seq(-210,-200,2)
# NOTE(review): `^` binds tighter than `:`, so this is (2^1):7 == 2:7;
# presumably 2^(1:7) (powers of two) was intended -- TODO confirm.
e4 = 2^1:7
v = rep(c(1,-1),times=25)
e5 = c(e2,e3)
?seq
e6 = seq(0,1,length=70)
e7 = rep(e1,times=10)
# Element-wise subtraction: lengths 200 vs 6, so e3 is recycled (with a warning).
e2-e3
### 1.2 Character vectors
# All vowels
vowels = c('a','e','i','o','u','y')
# All letters (the built-in `letters` constant: lowercase a-z)
letters = letters
# True or false if vowels are in letters
letters %in% vowels
# Index of each vowel
vowelsInLetters = which(letters %in% vowels)
# Index of each non-vowel
notVowelsInLetters = which(!(letters %in% vowels))
# Letters after vowels
lettersAfterVowels = letters[vowelsInLetters+1]
# myname with my name
myname = "Titouan"
# strsplit to extract letters (returns a one-element list of characters)
mynameSplited = strsplit(myname, NULL)
# Access its first element
mynameSplited[[1]][1]
mynameSplited[[1]]
mynameSplited[1]
# Index in alphabet of letters in my name
# NOTE(review): the leading uppercase "T" is not in lowercase `letters`,
# so it is silently skipped -- tolower() may have been intended; confirm.
indexMyname = which(letters %in% mynameSplited[[1]])
# Index in alphabet of letters in my neighbour's name
neighborname = "Marianne"
neighbornameSplited = strsplit(neighborname, NULL)
neighbornameSplited[[1]][1]
indexNeighborname = which(letters %in% neighbornameSplited[[1]])
# Min of the two average indexes
mynameAverage = mean(indexMyname)
neighborAverage = mean(indexNeighborname)
minIndexOfName = min(c(mynameAverage,neighborAverage))
### 2 DataFrames ###
### 2.1 Cute Animals
# All vowels
vowels = c('a','e','i','o','u','y')
# All letters
letters = letters
# Database of 3 columns : alphabet letters, number of each letter, binary variable vowel
database = data.frame(letter=letters, index=1:26, isVowels=letters %in% vowels)
# Extracting lines corresponding to my name (same uppercase caveat as above)
indexMyname = letters %in% strsplit("Titouan", NULL)[[1]]
database[indexMyname,"index"]
# Examining msleep dataset (ships with ggplot2/tidyverse)
library(tidyverse)
head(msleep)
str(msleep)
names(msleep)
# NOTE(review): bare `summary` just prints the function object;
# summary(msleep) was presumably intended -- TODO confirm.
summary
# Sanity check: animals are either awake or asleep (totals sum to 24h)
which(msleep$sleep_total + msleep$awake == 24 )
# Animal that sleeps the most
msleep[which.max(msleep$sleep_total),]
# Animals under 100g (bodywt is in kg) that sleep more than half a day
nrow(msleep[which(msleep$sleep_total > 12 & msleep$bodywt < 0.1),])
# Average brainwt/bodywt ratio (adds a `ratio` column)
msleep$ratio = msleep$brainwt/msleep$bodywt
mean(msleep$ratio, na.rm = TRUE)
# Animal with the highest ratio
msleep[which.max(msleep$ratio),]
### 2.2 Endangered animals
# Create a copy and order the conservation factor from least to most threatened
msleep_copy = msleep
msleep_copy$conservation = factor(x = msleep_copy$conservation, c("lc","domesticated","cd","nt","vu","en"), ordered = TRUE)
# Compare average weight of endangered animals to non-endangered
averageWtThreatened = mean(msleep_copy[which(msleep_copy$conservation > "nt"),]$bodywt)
averageWtRemaining = mean(msleep_copy[which(msleep_copy$conservation <= "nt"),]$bodywt)
# Add a boolean `threatened` column
msleep$threatened = msleep$conservation > "nt"
### 2.3 Functions
# Create a function taking a name and returning the vector of its letters.
lettersFromName <- function(name) {
  split_result <- strsplit(name, NULL)
  split_result[[1]]
}
# Smoke-test the helper on a regular name
lettersFromName("Titouan")
# Fix with empty "" name
lettersFromName("")
# Fixed variant: an empty name yields NULL; any other name is split
# into its individual characters.
lettersFromNameFixed <- function(name) {
  if (name == "") {
    return(NULL)
  }
  strsplit(name, NULL)[[1]]
}
# Check the empty-name edge case
lettersFromNameFixed("")
# With non-standard characters (spaces, accents, hyphens, digits all split too)
lettersFromNameFixed("X Æ A-12")
# Look up the genus of an animal (case-insensitive match on msleep$name)
# and phrase it as an English sentence; unknown names get "I don't know".
genusFromName <- function(name) {
  genusname <- msleep[tolower(msleep$name) == tolower(name),]$genus
  if (length(genusname) == 0) {
    return("I don't know")
  }
  paste(c("The ",name," is a ", genusname), collapse = '')
}
genusFromName("little brown bat")

File binario non mostrato.

Vedi file

@ -0,0 +1,315 @@
library(tidyverse)
library(directlabels)
####### 1 Plot and Given names #######
### Exploring the Pink City
# read the table prenoms.csv
# NOTE(review): hard-coded absolute path — adjust to your machine, or prefer
# a relative path so the script is portable.
prenoms <- read.csv('/home/labourde/Bureau/tp-analyse-explorative/prenoms.csv',sep=";")
# Inspect it
head(prenoms)
# Plot the number of births by year
prenoms %>%
  group_by(Année) %>%
  summarise(Naissances = sum(Nombre)) %>%
  ggplot(aes(x = Année, y = Naissances)) +
  geom_point()
# Plot the number of male/female births by year
prenoms %>%
  group_by(Année, Sexe) %>%
  summarise(Naissances = sum(Nombre)) %>%
  ggplot(aes(x = Année, y = Naissances)) +
  geom_point(aes(color = Sexe))
# Is my name in the dataset ?
prenoms[tolower(prenoms$Prénom) == tolower("Titouan"), ]
# Represent the 10 most given names
prenoms %>%
  group_by(Prénom) %>%
  summarise(Naissances = sum(Nombre)) %>%
  arrange(desc(Naissances)) -> mostGivenNames
# FIX: geom_col() is the idiomatic, case-safe replacement for
# geom_bar(stat = 'Identity') — ggplot2 stat names are lower-case.
ggplot(head(mostGivenNames, 10), aes(x = Prénom, y = Naissances)) +
  geom_col(aes(fill = Prénom))
# Select for each year the top 5 given names by sex and their evolution through the years
# (base `[ , ]` subsetting is kept on purpose: unlike filter(), it preserves
# the original NA-handling of the script)
mostMaleGivenNamesByYear <- prenoms[prenoms$Sexe == "M", ] %>%
  group_by(Année) %>%
  slice_max(order_by = Nombre, n = 5)
mostFemaleGivenNamesByYear <- prenoms[prenoms$Sexe == "F", ] %>%
  group_by(Année) %>%
  slice_max(order_by = Nombre, n = 5)
mostGivenNamesBySexeAndYear <- rbind(mostMaleGivenNamesByYear, mostFemaleGivenNamesByYear)
# One small panel per name, points coloured by name
ggplot(mostGivenNamesBySexeAndYear, aes(x = Année, y = Nombre)) +
  geom_point(aes(color = Prénom)) +
  facet_wrap("Prénom")
# Plot the average number of letters in given names, by year
averageNumberOfLettersByYear <- prenoms %>%
  group_by(Année) %>%
  summarise(Lettres = mean(nchar(Prénom)))
ggplot(averageNumberOfLettersByYear, aes(x = Année, y = Lettres)) +
  geom_line()
# Plot the average numbers of vowels/consonants by years
# BUG FIX: the original took nchar() of the string *after removing* the
# vowels (resp. consonants), i.e. it counted every character EXCEPT the
# targeted ones — the two measures were effectively swapped. Count the
# removed characters (original length minus stripped length) instead.
prenoms %>%
  group_by(Année) %>%
  summarise(
    Vowels = mean(nchar(Prénom) - nchar(gsub("[éèêëàâäïaeiouy]", "", tolower(Prénom)))),
    Consonants = mean(nchar(Prénom) - nchar(gsub("[zrtpqsdfghjklmwxcvbnç]", "", tolower(Prénom))))
  ) -> averageNumberOfVowelsAndConsonantsByYear
ggplot(averageNumberOfVowelsAndConsonantsByYear, aes(x = Année, y = Vowels)) +
  geom_point()
ggplot(averageNumberOfVowelsAndConsonantsByYear, aes(x = Année, y = Consonants)) +
  geom_point()
# How the number of hyphenated (composed) names changes through the years.
# Base subsetting (not filter()) keeps the script's original NA behavior.
composedNamesByYear <- prenoms[str_detect(prenoms$Prénom, regex("[a-zA-Zéèàï]+-[a-zA-ZéèàÎ]+")), ] %>%
  group_by(Année) %>%
  summarise(nb_composedNames = sum(Nombre))
ggplot(composedNamesByYear, aes(x = Année, y = nb_composedNames)) +
  geom_line()
# Defining a "hype" criteria and finding the hypest names:
# a name is "hype" when its count jumps the most versus the previous row.
prenoms %>%
group_by(Année) -> namesByYear
# NOTE(review): lag() shifts the *entire* column by one row, ignoring the
# grouping — so `difference` compares a row's count with whatever name sits
# on the previous row, not with the same name's count one year earlier.
# Confirm this is the intended criterion (it depends on the row order).
namesByYear$nbAnnéePrec <- lag(namesByYear$Nombre)
namesByYear$difference <- namesByYear$Nombre - namesByYear$nbAnnéePrec
# Keep, for each year, the row with the largest jump
namesByYear %>%
group_by(Année) %>%
slice_max(order_by = difference, n = 1) -> hypestNameByYear
# Plot each year's winning name as text placed at its jump height
ggplot(hypestNameByYear,aes(x=factor(Année),y=difference)) +
geom_text(aes(label=Prénom))
### Exploring the Gray City
# read the table prenomsParis.csv and align its column names with the
# Toulouse dataset (one rename() call instead of two successive ones)
prenomsParis <- read.csv('/home/labourde/Bureau/tp-analyse-explorative/prenomsParis.csv',sep=";")
prenomsParis <- rename(prenomsParis, Année = Annee, Prénom = Prenoms)
# Produce the standard set of nine exploratory plots for a given-names table.
#
# @param prenoms data frame with columns Année, Sexe, Prénom, Nombre.
# @return list of 9 ggplot objects: births/year, births by sex, top-10 names,
#   yearly top-5 by sex, average letters, vowels, consonants, composed names,
#   and the "hypest" name per year.
nameStudy <- function(prenoms) {
  # Plot the number of births by year
  prenoms %>%
    group_by(Année) %>%
    summarise(Naissances = sum(Nombre)) %>%
    ggplot(aes(x = Année, y = Naissances)) +
    ggtitle("Naissances par année") +
    geom_point() -> p1
  # Plot the number of male/female births by year
  prenoms %>%
    group_by(Année, Sexe) %>%
    summarise(Naissances = sum(Nombre)) %>%
    ggplot(aes(x = Année, y = Naissances)) +
    ggtitle("Sexe des naissances par année") +
    geom_point(aes(color = Sexe)) -> p2
  # Is my name in the dataset ?
  prenoms[tolower(prenoms$Prénom) == tolower("Titouan"), ]
  # Represent the 10 most given names
  prenoms %>%
    group_by(Prénom) %>%
    summarise(Naissances = sum(Nombre)) %>%
    arrange(desc(Naissances)) -> mostGivenNames
  # FIX: geom_col() replaces geom_bar(stat = 'Identity') (stats are lower-case)
  ggplot(head(mostGivenNames, 10), aes(x = reorder(Prénom, Naissances), y = Naissances)) +
    ggtitle("Les 10 noms les plus donnés") +
    geom_col(aes(fill = reorder(Prénom, Naissances))) -> p3
  # Select for each year the top 5 given names by sex and their evolution through the years
  prenoms[prenoms$Sexe == "M", ] %>%
    group_by(Année) %>%
    slice_max(order_by = Nombre, n = 5) -> mostMaleGivenNamesByYear
  prenoms[prenoms$Sexe == "F", ] %>%
    group_by(Année) %>%
    slice_max(order_by = Nombre, n = 5) -> mostFemaleGivenNamesByYear
  mostGivenNamesBySexeAndYear <- rbind(mostMaleGivenNamesByYear, mostFemaleGivenNamesByYear)
  ggplot(mostGivenNamesBySexeAndYear, aes(x = Année, y = Nombre)) +
    ggtitle("Top 5 des noms donnés, par sexe et par an") +
    geom_point(aes(color = Prénom)) +
    facet_wrap("Prénom") -> p4
  # Plot the average numbers of letters by years
  prenoms %>%
    group_by(Année) %>%
    summarise(Lettres = mean(nchar(Prénom))) -> averageNumberOfLettersByYear
  ggplot(averageNumberOfLettersByYear, aes(x = Année, y = Lettres)) +
    ggtitle("Nombre moyen de lettres dans les prénoms par année") +
    geom_line() -> p5
  # Plot the average numbers of vowels/consonants by years
  # BUG FIX: the original counted the characters *left over* after stripping
  # vowels (resp. consonants) — the opposite quantity. Count the stripped
  # characters (original length minus stripped length) instead.
  prenoms %>%
    group_by(Année) %>%
    summarise(
      Vowels = mean(nchar(Prénom) - nchar(gsub("[éèêëàâäïaeiouy]", "", tolower(Prénom)))),
      Consonants = mean(nchar(Prénom) - nchar(gsub("[zrtpqsdfghjklmwxcvbnç]", "", tolower(Prénom))))
    ) -> averageNumberOfVowelsAndConsonantsByYear
  ggplot(averageNumberOfVowelsAndConsonantsByYear, aes(x = Année, y = Vowels)) +
    ggtitle("Nombre moyen de voyelles dans les prénoms par année") +
    geom_point() -> p6
  ggplot(averageNumberOfVowelsAndConsonantsByYear, aes(x = Année, y = Consonants)) +
    ggtitle("Nombre moyen de consonnes dans les prénoms par année") +
    geom_point() -> p7
  # How the number of composed names change through the years
  prenoms[str_detect(prenoms$Prénom, regex("[a-zA-Zéèàï]+-[a-zA-ZéèàÎ]+")), ] %>%
    group_by(Année) %>%
    summarise(nb_composedNames = sum(Nombre)) -> composedNamesByYear
  ggplot(composedNamesByYear, aes(x = Année, y = nb_composedNames)) +
    ggtitle("Nombre de prénoms composés par année") +
    geom_line() -> p8
  # Defining a "hype" criteria and finding the hypest names
  # NOTE(review): lag() shifts the whole column, not per name — `difference`
  # compares against the previous *row*, not the same name's previous year.
  prenoms %>%
    group_by(Année) -> namesByYear
  namesByYear$nbAnnéePrec <- lag(namesByYear$Nombre)
  namesByYear$difference <- namesByYear$Nombre - namesByYear$nbAnnéePrec
  namesByYear %>%
    group_by(Année) %>%
    slice_max(order_by = difference, n = 1) -> hypestNameByYear
  ggplot(hypestNameByYear, aes(x = factor(Année), y = difference)) +
    ggtitle("Prénom le plus hype par année") +
    geom_text(aes(label = Prénom)) -> p9
  return(list(p1, p2, p3, p4, p5, p6, p7, p8, p9))
}
plotsParis <- nameStudy(prenomsParis)
plotsParis
# A tale of two (or more) cities: stack Toulouse and Paris into one table,
# tagged with a Ville column (the Toulouse-only Ordre column is dropped first)
prenoms <- subset(prenoms, select = -Ordre)
prenoms$Ville <- "Toulouse"
prenomsParis$Ville <- "Paris"
allPrenoms <- rbind(prenoms, prenomsParis)
# Precompute each (year, city)'s total births for later normalisation
allPrenoms <- allPrenoms %>%
  group_by(Année, Ville) %>%
  mutate(NaissancesVilleAnnée = sum(Nombre))
# Same nine-plot study as nameStudy(), for a table that also carries a Ville
# column so several cities can be compared (percentages normalised per city,
# facets/colours by city).
#
# @param prenoms data frame with columns Année, Sexe, Prénom, Nombre, Ville.
# @return list of 9 ggplot objects (p4 may be a caught error object when the
#   top-5 panel fails — see the tryCatch below).
nameStudyCombined <- function(prenoms) {
  # Plot the number of births by year
  prenoms %>%
    group_by(Année, Ville) %>%
    summarise(Naissances = sum(Nombre)) %>%
    ggplot(aes(x = Année, y = Naissances)) +
    geom_point(aes(color = Ville)) +
    geom_line(aes(color = Ville)) +
    ggtitle("Naissances par année") -> p1
  # Total births per (year, city), used below to turn counts into percentages
  prenoms %>%
    group_by(Année, Ville) %>%
    mutate(NaissancesVilleAnnée = sum(Nombre)) -> prenoms
  # Plot the number of male/female births by year and by cities
  # FIX: NaissancesVilleAnnée is constant within each group — take first() so
  # summarise() returns one row per group instead of the deprecated
  # multi-row summarise the original produced.
  prenoms %>%
    group_by(Année, Sexe, Ville) %>%
    summarise(Naissances = 100 * sum(Nombre) / first(NaissancesVilleAnnée)) %>%
    ggplot(aes(x = Année, y = Naissances)) +
    geom_point(aes(color = Sexe)) +
    facet_wrap("Ville") +
    ggtitle("Sexe des naissances par année en %") -> p2
  # Represent the 10 most given names by cities
  # BUG FIX: the original divided by sum(NaissancesVille) — the city total
  # multiplied by that name's row count — which distorts the ranking. Divide
  # by the (group-constant) city total itself.
  prenoms %>%
    group_by(Ville) %>%
    mutate(NaissancesVille = sum(Nombre)) %>%
    group_by(Prénom, Ville) %>%
    summarise(Naissances = 100 * sum(Nombre) / first(NaissancesVille)) %>%
    group_by(Ville) %>%
    slice_max(order_by = Naissances, n = 10) -> mostGivenNames
  # FIX: geom_col() replaces geom_bar(stat = 'Identity')
  ggplot(mostGivenNames, aes(x = reorder(Prénom, Naissances), y = Naissances)) +
    ggtitle("Les 10 noms les plus donnés par ville") +
    geom_col(aes(fill = reorder(Prénom, Naissances))) +
    facet_wrap("Ville") -> p3
  # Select for each year the top 5 given names by sex and their evolution through the years by cites
  # (wrapped in tryCatch so a failure here still lets the other plots build;
  # on error p4 holds the condition object)
  tryCatch(
    {
      prenoms[prenoms$Sexe == "M", ] %>%
        group_by(Ville, Année) %>%
        slice_max(order_by = Nombre, n = 5) -> mostMaleGivenNamesByYear
      prenoms[prenoms$Sexe == "F", ] %>%
        group_by(Ville, Année) %>%
        slice_max(order_by = Nombre, n = 5) -> mostFemaleGivenNamesByYear
      mostGivenNamesBySexeAndYear <- rbind(mostMaleGivenNamesByYear, mostFemaleGivenNamesByYear)
      mostGivenNamesBySexeAndYear %>%
        group_by(Prénom, Ville, Année) %>%
        mutate(Naissances = 100 * sum(Nombre) / sum(NaissancesVilleAnnée)) -> mostGivenNamesBySexeAndYearNormalized
      ggplot(mostGivenNamesBySexeAndYearNormalized, aes(x = Année, y = Naissances)) +
        ggtitle("Top 5 des noms donnés, par sexe, par an et par ville") +
        geom_line(aes(color = Prénom)) +
        geom_point(aes(color = Prénom, shape = Sexe), size = 3) +
        geom_dl(aes(label = Prénom), method = list(dl.combine("first.points")), cex = 0.8) +
        facet_wrap("Ville")
    },
    error = function(e) e
  ) -> p4
  # Plot the average numbers of letters by years by cities
  prenoms %>%
    group_by(Année, Ville) %>%
    summarise(Lettres = mean(nchar(Prénom))) -> averageNumberOfLettersByYear
  ggplot(averageNumberOfLettersByYear, aes(x = Année, y = Lettres)) +
    ggtitle("Nombre moyen de lettres dans les prénoms par année et par ville") +
    geom_point(aes(color = Ville)) +
    geom_line(aes(color = Ville)) -> p5
  # Plot the average numbers of vowels/consonants by years and by cities
  # BUG FIX: as in nameStudy(), count the *stripped* characters, not the
  # leftovers — the original swapped vowels and consonants.
  prenoms %>%
    group_by(Ville, Année) %>%
    summarise(
      Vowels = mean(nchar(Prénom) - nchar(gsub("[éèêëàâäïaeiouy]", "", tolower(Prénom)))),
      Consonants = mean(nchar(Prénom) - nchar(gsub("[zrtpqsdfghjklmwxcvbnç]", "", tolower(Prénom))))
    ) -> averageNumberOfVowelsAndConsonantsByYear
  ggplot(averageNumberOfVowelsAndConsonantsByYear, aes(x = Année, y = Vowels)) +
    ggtitle("Nombre moyen de voyelles dans les prénoms par année et par ville") +
    geom_point(aes(color = Ville)) +
    geom_line(aes(color = Ville)) -> p6
  ggplot(averageNumberOfVowelsAndConsonantsByYear, aes(x = Année, y = Consonants)) +
    ggtitle("Nombre moyen de consonnes dans les prénoms par année et par ville") +
    geom_point(aes(color = Ville)) +
    geom_line(aes(color = Ville)) -> p7
  # How the number of composed names change through the years
  # FIX: first() for the same single-row-summarise reason as p2.
  prenoms[str_detect(prenoms$Prénom, regex("[a-zA-Zéèàï]+-[a-zA-ZéèàÎ]+")), ] %>%
    group_by(Année, Ville) %>%
    summarise(nb_composedNames = 100 * sum(Nombre) / first(NaissancesVilleAnnée)) -> composedNamesByYear
  ggplot(composedNamesByYear, aes(x = Année, y = nb_composedNames)) +
    ggtitle("Nombre de prénoms composés par année et par ville") +
    geom_point(aes(color = Ville)) +
    geom_line(aes(color = Ville)) -> p8
  # Defining a "hype" criteria and finding the hypest names
  # NOTE(review): lag() shifts the whole column regardless of grouping —
  # `difference` compares against the previous row, not the same name's
  # previous year. Row-order dependent; verify intent.
  prenoms %>%
    group_by(Année, Ville) -> namesByYearAndCities
  namesByYearAndCities$nbAnnéePrec <- lag(namesByYearAndCities$Nombre)
  namesByYearAndCities$difference <- namesByYearAndCities$Nombre - namesByYearAndCities$nbAnnéePrec
  namesByYearAndCities %>%
    group_by(Année, Ville) %>%
    slice_max(order_by = difference, n = 1) -> hypestNameByYearAndCities
  ggplot(hypestNameByYearAndCities, aes(x = factor(Année), y = difference / NaissancesVilleAnnée)) +
    ggtitle("Prénom le plus hype par année et par ville") +
    geom_text(aes(label = Prénom, color = Prénom)) +
    facet_wrap("Ville") -> p9
  return(list(p1, p2, p3, p4, p5, p6, p7, p8, p9))
}
combinedParisToulouse <- nameStudyCombined(allPrenoms)
combinedParisToulouse
# The most unshared names
# Compute, per name and city, that name's share of the city's total births
allPrenoms %>%
group_by(Ville) %>%
mutate(NaissancesVille=sum(Nombre)) %>%
group_by(Prénom,Ville) %>%
mutate(Proportion=sum(Nombre)/NaissancesVille) %>%
group_by(Prénom,Ville) -> namesByCities
# NOTE(review): lag() shifts the whole Proportion column by one row. This
# only yields "the other city's share of the same name" if the rows are
# strictly ordered as alternating (name, city) pairs — an assumption the
# code never enforces; verify before trusting `difference`.
namesByCities$autreVilleProportion <- lag(namesByCities$Proportion)
namesByCities$difference <- namesByCities$Proportion - namesByCities$autreVilleProportion
### A Tale of many cities
# read the table prenomsRennesStrassNantesToul.csv and align the INSEE
# column names with the naming used so far (one rename() call)
prenomsRennesStrassNantesToul <- read.csv('/home/labourde/Bureau/tp-analyse-explorative/prenomsRennesStrassNantesToul.csv',sep=";")
prenomsRennesStrassNantesToul <- rename(
  prenomsRennesStrassNantesToul,
  Année = ANNAISS,
  Ville = LBCOM,
  Sexe = SEX,
  Prénom = PRN,
  Nombre = NRB
)
# Quick look at a single city's subset
a <- prenomsRennesStrassNantesToul[prenomsRennesStrassNantesToul$Ville == "RENNES", ]
# Run the combined study and print each of the nine plots
combinedRennesStrassNantesToul <- nameStudyCombined(prenomsRennesStrassNantesToul)
combinedRennesStrassNantesToul[1]
combinedRennesStrassNantesToul[2]
combinedRennesStrassNantesToul[3]
combinedRennesStrassNantesToul[4]
combinedRennesStrassNantesToul[5]
combinedRennesStrassNantesToul[6]
combinedRennesStrassNantesToul[7]
combinedRennesStrassNantesToul[8]
combinedRennesStrassNantesToul[9]

File binario non mostrato.

File binario non mostrato.

File binario non mostrato.

File diff soppresso perché troppo grande Carica differenze

File diff soppresso perché troppo grande Carica differenze