!pip install gdown==v4.6.3
import gdown
import os
# Google Drive URL for the .txt file
gdrive_url = "https://drive.google.com/uc?id=1fCDV6WETqoO1G7EPGGbdAmGGcrKbTLmW"
#gdrive_url = "https://drive.google.com/uc?id=1NXM_XsjctF46Y4Yc3lW0HVasAXwuPoIJ"

# Create the target directory for .txt content if it doesn't exist
target_directory_txt = "/content"

# Download the .txt file into the target directory
gdown.download(gdrive_url, os.path.join(target_directory_txt, "Cada.csv"), quiet=False)

Collecting gdown==v4.6.3
  Downloading gdown-4.6.3-py3-none-any.whl.metadata (4.4 kB)
Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from gdown==v4.6.3) (3.17.0)
Requirement already satisfied: requests[socks] in /usr/local/lib/python3.11/dist-packages (from gdown==v4.6.3) (2.32.3)
Requirement already satisfied: six in /usr/local/lib/python3.11/dist-packages (from gdown==v4.6.3) (1.17.0)
Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from gdown==v4.6.3) (4.67.1)
Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.11/dist-packages (from gdown==v4.6.3) (4.12.3)
Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.11/dist-packages (from beautifulsoup4->gdown==v4.6.3) (2.6)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown==v4.6.3) (3.4.1)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown==v4.6.3) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown==v4.6.3) (2.3.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown==v4.6.3) (2024.12.14)
Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.11/dist-packages (from requests[socks]->gdown==v4.6.3) (1.7.1)
Downloading gdown-4.6.3-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 5.2.0
    Uninstalling gdown-5.2.0:
      Successfully uninstalled gdown-5.2.0
Successfully installed gdown-4.6.3

Downloading...
From: https://drive.google.com/uc?id=1fCDV6WETqoO1G7EPGGbdAmGGcrKbTLmW
To: /content/Cada.csv
100%|██████████| 181M/181M [00:03<00:00, 49.5MB/s]

'/content/Cada.csv'

import numpy as np
import pandas as pd

df = pd.read_csv("Cada.csv", header="infer", encoding="utf8")
df = df.fillna("") # A lot of the manipulation we'll do won't work if the dataframe has too many "N/A" values -
# and so we specify that all these empty values are actually empty strings
print(len(df))  # We check how long the dataframe is
print("___________")
print(df.head(15))  # We check that it is well-loaded by printing the top 15 rows
print("___________")
df.describe()

52982
___________
    Numéro de dossier                                     Administration  \
0            19840002                             ministre de la défense   
1            19840003                             ministre de la défense   
2            19850001  ministre de l'économie, des finances et du budget   
3            19860001  directeur départemental du travail et de l'emp...   
4            19870061                  gouverneur de la Banque de France   
5            19870380  directeur de la caisse régionale d'assurance m...   
6            19881774                       directeur général des impôts   
7            19881993                            ministre de l'intérieur   
8            19890099  ministre de l'intérieur (direction de la logis...   
9            19890845  ministre de la justice (direction de l'adminis...   
10           19900254                                 maire de Dunkerque   
11           19900695  chef de l'inspection générale des affaires soc...   
12           19900868     Président du conseil général de la Haute-Corse   
13           19901565           Président de l'Ecole privée Saint-Joseph   
14           19901932  directeur de la caisse d'allocations familiale...   

       Type  Année      Séance  \
0      Avis   1984  03/03/1984   
1      Avis   1984  13/12/1984   
2      Avis   1985  05/12/1985   
3   Conseil   1986  30/01/1986   
4      Avis   1987  02/04/1987   
5      Avis   1987  30/04/1987   
6      Avis   1988  24/11/1988   
7      Avis   1988  22/12/1988   
8      Avis   1989  19/01/1989   
9   Conseil   1989  18/05/1989   
10  Conseil   1990  08/02/1990   
11     Avis   1990  10/05/1990   
12  Conseil   1990  23/05/1990   
13     Avis   1990  27/09/1990   
14     Avis   1990  22/11/1990   

                                                Objet  \
0                                         [Voir avis]   
1                                         [Voir avis]   
2                                         [Voir avis]   
3                                         (voir avis)   
4                                         [Voir avis]   
5                                         [Voir avis]   
6                                         (voir avis)   
7                                         [Voir avis]   
8   - instructions, circulaires et guides opérateu...   
9   - "tableau de service" du centre de détention ...   
10  - liste des personnels handicapés employés par...   
11  - dossier déposé par Monsieur G. pour le trans...   
12  - communication à un tiers de l'arrêté radiant...   
13  - budget de l'école privée Saint-Joseph pour l...   
14  - lettre émanant de Mme C. figurant au dossier...   

                                  Thème et sous thème  \
0   Justice, Ordre Public Et Sécurité/Défense Et D...   
1   Economie, Industrie, Agriculture/Secteurs Écon...   
2   Justice, Ordre Public Et Sécurité/Association,...   
3                  Travail Et Emploi/Droit Du Travail   
4   Economie, Industrie, Agriculture/Secteurs Écon...   
5   Affaires Sanitaires Et Sociales/Solidarités Et...   
6   Finances Publiques Et Fiscalité/Contrôle Fisca...   
7   Justice, Ordre Public Et Sécurité/Sécurité Civ...   
8   Justice, Ordre Public Et Sécurité/Sécurité Civ...   
9   Travail Et Emploi/Emploi Public, Justice, Ordr...   
10                    Travail Et Emploi/Emploi Public   
11  Affaires Sanitaires Et Sociales/Santé Publique...   
12                    Travail Et Emploi/Emploi Public   
13  Enseignement, Culture, Loisirs/Enseignement Sc...   
14  Affaires Sanitaires Et Sociales/Solidarités Et...   

                                  Mots clés  \
0                                     Armée   
1                                     Armée   
2                                             
3       Licenciement, Conditions de travail   
4                     Banques et assurances   
5   Sécurité sociale, Conditions de travail   
6                                             
7               Politique et réglementation   
8               Politique et réglementation   
9              Carrière, Lieux de détention   
10                          Carrière, Santé   
11                               Médicament   
12                 Territoriale, Discipline   
13                                            
14                         Sécurité sociale   

                                   Sens et motivation Partie  \
0                                 Défavorable/Défense    III   
1   Irrecevable/Imprécise, Défavorable/Défense, In...    III   
2                                         Défavorable    III   
3                   Défavorable/Procédure, Sans objet    III   
4   Défavorable/Sécurité publique, Favorable/Sauf ...    III   
5                                  Défavorable/Secret    III   
6                               Défavorable/Procédure    III   
7                     Défavorable/Abusive, Sans objet    III   
8                                           Favorable    III   
9                       Défavorable/Sécurité publique     II   
10                             Défavorable/Vie privée     II   
11               Favorable/Sauf commercial industriel     II   
12                                          Favorable     II   
13                                          Favorable    III   
14                                          Favorable     II   

                                                 Avis  
0   La commission d'accès aux documents administra...  
1   La commission d'accès aux documents administra...  
2   La commission d'accès aux documents administra...  
3   La commission d'accès aux documents administra...  
4   La commission d'accès aux documents administra...  
5   La commission d'accès aux documents administra...  
6   La commission d'accès aux documents administra...  
7   La commission d'accès aux documents administra...  
8   La commission d'accès aux documents administra...  
9   La commission d'accès aux documents administra...  
10  communication de la liste des personnels handi...  
11  La commission d'accès aux documents administra...  
12  La commission d'accès aux documents administra...  
13  La commission d'accès aux documents administra...  
14  favorable à la communication à Mme C., par vou...  
___________

# I am Monsieur X

df.loc[df.Avis.str.contains("l'investisseur X")]

print(df.Type.value_counts(normalize=True))  # We can also multiply by * 100 to get proper percentages
df.groupby("Année").size().plot()  # We group by years, and then use plot the have an idea of the distribution

Type
Avis        0.940772
Conseil     0.059171
Sanction    0.000057
Name: proportion, dtype: float64

<Axes: xlabel='Année'>

df.Année.hist(bins=20)  # Another approach is to use an histogram, with all years divided into x bins

df.groupby("Année").Type.value_counts().unstack(level=0)
# We combine both tools to get a broader type of chart, and then we unstack

import matplotlib.pyplot as plt # This is a standard plotting module

df.index = pd.to_datetime(df.Séance, format="%d/%m/%Y")  # To analyse time series, your dataframe should be indexed chronologically,
# which you can do by replacing the index with this function that creates datetime objects from a column with dates
plt.figure(figsize=(15,7))  # To manage the following graphs, we initialise an empty plot
plt.subplot(131) # Then in that subplot we indicate that we want to have a suplot at that index
df.resample("1Y").size().plot()  # The method "resample" is equivalent to the groupby above, though it works on the index;
# since the index are dates,  you can resample by quarters, month, etc.
plt.subplot(132)
df.resample("1M").size().plot()
df = df.sort_index() # Necessary for next line to work
df = df["2010-01-01":"2025-01-01"]  # Data before 2010 is not very relevant, let's cut it out by indexing
x = df.resample("1M", convention="start").size()
xroll = x.rolling(5).sum() # Let's get a rolling average to get thing in better perspective
plt.subplot(133)
xroll.plot()

<ipython-input-6-ce6b6230d4c5>:7: FutureWarning: 'Y' is deprecated and will be removed in a future version, please use 'YE' instead.
  df.resample("1Y").size().plot()  # The method "resample" is equivalent to the groupby above, though it works on the index;
<ipython-input-6-ce6b6230d4c5>:10: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  df.resample("1M").size().plot()
<ipython-input-6-ce6b6230d4c5>:13: FutureWarning: The 'convention' keyword in DataFrame.resample is deprecated and will be removed in a future version. Explicitly cast PeriodIndex to DatetimeIndex before resampling instead.
  x = df.resample("1M", convention="start").size()
<ipython-input-6-ce6b6230d4c5>:13: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  x = df.resample("1M", convention="start").size()

<Axes: xlabel='Séance'>

print(df.iloc[10])  # Compared to your Excel, pandas are shifted two indexes below, to account for 1. the headers, and
# 2. the fact that Python indexes start at 0
print("_______________")
print(df.loc[df.Année == "2010"])  # You specify a condition, and then obtain a smaller dataframe if you do not care for
# a particular column; notice that the type is important.
# Use the tilde ~ to specify a negative condition df.loc[~(XX) & (XX)]
print("_______________")
print(df.loc[df.Année == 2010][["Type", "Objet"]])  # You get more than one columns by using a double list

Numéro de dossier                                               20100369
Administration         inspecteur d'académie, directeur des services ...
Type                                                                Avis
Année                                                               2010
Séance                                                        28/01/2010
Objet                  La communication du fichier numérique de l'ens...
Thème et sous thème                      Travail Et Emploi/Emploi Public
Mots clés                                                       Carrière
Sens et motivation                             Favorable/Sauf vie privée
Partie                                                               III
Avis                   La secrétaire générale du SGEN-CFDT Midi-Pyrén...
Name: 2010-01-28 00:00:00, dtype: object
_______________
Empty DataFrame
Columns: [Numéro de dossier, Administration, Type, Année, Séance, Objet, Thème et sous thème, Mots clés, Sens et motivation, Partie, Avis]
Index: []
_______________
               Type                                              Objet
Séance                                                                
2010-01-14     Avis  - copie des documents suivants concernant Mons...
2010-01-14     Avis  - la copie du dossier fiscal de la SARL Sparti...
2010-01-14  Conseil  - caractère communicable à X, propriétaire d'u...
2010-01-14     Avis  - communication des documents suivants :\n1) l...
2010-01-14     Avis  - la copie des documents suivants relatifs à l...
...             ...                                                ...
2010-12-21  Conseil  - caractère communicable des tables décennales...
2010-12-21  Conseil  - réponse à apporter à la demande formulée par...
2010-12-21     Avis  - copie, par réalisation à ses frais de ses pr...
2010-12-21     Avis  - communication des documents suivants, relati...
2010-12-21     Avis  - copie de la signature et du paraphe de Maîtr...

[155 rows x 2 columns]

print(df["Sens et motivation"].value_counts())

Sens et motivation
Favorable                                                                                                                                                            13328
Sans objet/Communiqué                                                                                                                                                 6268
Sans objet/Inexistant                                                                                                                                                 1880
Favorable/Sauf vie privée                                                                                                                                             1675
Favorable/Sauf commercial industriel                                                                                                                                  1030
                                                                                                                                                                     ...  
Favorable/Sauf diffusion publique, Favorable/Sauf établissement de document, Sans objet/Communiqué, Favorable/Sauf préparatoire                                          1
Sans objet/Communiqué, Favorable/Sauf vie privée, Favorable/Sauf comportement, Favorable/Sauf appréciation, Défavorable/Comportement, Incompétence/Juridictionnel        1
Favorable/Sauf commercial industriel, Favorable/Sauf juridictionnel                                                                                                      1
Favorable, Irrecevable/Prématurée                                                                                                                                        1
Favorable/Sauf établissement de document, Favorable/Article L311-2 favorable par l'autorité compétente à laquelle vous auriez dû transmettre                             1
Name: count, Length: 4439, dtype: int64

import regex as re

print(df["Sens et motivation"].value_counts())

df["Result"] = ""  # We want to create a column that gives us the result, there are several ways to do it

for index, row in df.iterrows():  # One way would be to iterate over each row and check that the condition is met
    if re.search("défavorable", row["Sens et motivation"], re.S | re.I):
        # Note that we start with défavorable, because the term contains "favorable" in it, which could confuse the regex search
        df.at[index, "Result"] = "Unfavourable"
    elif re.search("favorable", row["Sens et motivation"], re.S | re.I):
        df.at[index, "Result"] = "Favourable"
    else:
        df.at[index, "Result"] = "Unfavourable"

Sens et motivation
Favorable                                                                                                                                                            13328
Sans objet/Communiqué                                                                                                                                                 6268
Sans objet/Inexistant                                                                                                                                                 1880
Favorable/Sauf vie privée                                                                                                                                             1675
Favorable/Sauf commercial industriel                                                                                                                                  1030
                                                                                                                                                                     ...  
Favorable/Sauf diffusion publique, Favorable/Sauf établissement de document, Sans objet/Communiqué, Favorable/Sauf préparatoire                                          1
Sans objet/Communiqué, Favorable/Sauf vie privée, Favorable/Sauf comportement, Favorable/Sauf appréciation, Défavorable/Comportement, Incompétence/Juridictionnel        1
Favorable/Sauf commercial industriel, Favorable/Sauf juridictionnel                                                                                                      1
Favorable, Irrecevable/Prématurée                                                                                                                                        1
Favorable/Sauf établissement de document, Favorable/Article L311-2 favorable par l'autorité compétente à laquelle vous auriez dû transmettre                             1
Name: count, Length: 4439, dtype: int64

def resultsearch(value):  # A second way would be to use function that will do the same,
    # and apply it to the dataframe with ".apply"
    if re.search("défavorable", value, re.S|re.I):
        return "Unfavourable"
    elif re.search("favorable", value, re.S|re.I):
        return "Favourable"
    else:
        return "Unfavourable"

df["Result"] = df["Sens et motivation"].apply(lambda x: resultsearch(x))
# This allows you to apply a function to a value, which is represented by "x" here

print(df.Result.value_counts(normalize=True) * 100)  # We obtain percent of favourable decisions
print("_______________")
df.groupby("Année").Result.value_counts(normalize=True).unstack().plot()
# And then we plot it. See that the low number of entries for earlier years makes it harder to draw conclusions.

Result
Favourable      61.151309
Unfavourable    38.848691
Name: proportion, dtype: float64
_______________

<Axes: xlabel='Année'>

print("There are ", df["Administration"].nunique(), " different values for Administration in the dataset...")


dict_admin = {"[Mm]airie": "Municipal", "[Rr]égion": "Regional", "[dD]epartment": "Department",
        "[Mm]inist|[Pp]réfec?t": "Central"}  # We create a that will link some regexes to synthetise values. For instance,
# we can expect that all values having the term "[Mm]inist" pertain to the central administration

def apply_type(value): # The function will loop over the dict's keys
    for key in dict_admin:
        if re.search(key, value.strip(), re.S | re.I):
            return dict_admin[key]
    return "Other"

df["Admin"] = df.Administration.astype(str).apply(lambda x: apply_type(x)) # We apply the function to the Administration column,
# creating a new "Admin" column with synthetised value
print(df.Admin.value_counts()) # And now we check how many rows we managed to synthetise - e.g., how many "Other" remain
print("_______________")

# Since there are still many "Other", we return to the data to check what they refer to in general, trying to find
# Further rules to add to our dictionary
print(df.loc[df.Admin == "Other"].Administration.value_counts()[:20])
# Locating with Python using a condition. You'll note for instance that acronymes with a number are often departemental
# admins, whereas the use of the term "national" indicate central administration

There are  13480  different values for Administration in the dataset...
Admin
Other        24731
Municipal    13624
Central       9355
Regional      2320
Name: count, dtype: int64
_______________
Administration
Direction générale des finances publiques (DGFIP)                              1648
Assistance Publique-Hôpitaux de Paris (AP-HP)                                   615
X, député                                                                       420
La Poste                                                                        211
Direction générale des patrimoines                                              204
Pôle emploi                                                                     152
X, députée                                                                      147
Caisse interprofessionnelle de prévoyance et d'assurance vieillesse (CIPAV)     110
Orange Groupe                                                                   105
Office français de l'immigration et de l'intégration (OFII)                     104
Assistance publique-Hôpitaux de Marseille (AP-HM)                                92
Communauté d'agglomération de Quimperlé                                          84
Rectorat de l'académie de Créteil (AC 94)                                        83
Service départemental d'incendie et de secours du Cher (SDIS 18)                 77
Caisse primaire d'assurance maladie du Rhône (CPAM 69)                           72
Commission nationale de l'informatique et des libertés (CNIL)                    71
Direction générale des douanes et droits indirects (DGDDI)                       68
Office national des forêts (ONF)                                                 64
Rectorat de l'académie de Paris (AC 75)                                          64
Agence nationale de sécurité du médicament et des produits de santé (ANSM)       61
Name: count, dtype: int64

dict_admin = {"[Mm]airie|agglom|commune": "Municipal", "[Rr]égion": "Regional", "[dD]epartment|\d\d\)?$": "Department",
        "[Mm]inist|[Pp]réfec?t|[Ddirection [Gg]énérale|[Nn]ational|[Ff]rançais|[Uu]niversit|[A-Z-]+\)?$": "Central"}
# We add these new rules and terms to our regex, and redo the process

df["Admin"] = df.Administration.astype(str).apply(lambda x: apply_type(x))  # We reapply with broader dict

print(df.Admin.value_counts())  # If the number of others is negligible, we consider that the job is done

Admin
Central       25495
Municipal     15334
Department     6375
Regional       2300
Other           526
Name: count, dtype: int64

import seaborn as sns

sns.set_style("dark")  # Change style of plots with sns
sns.countplot(x="Admin", hue="Result", data=df)   # Seaborn has great plot types such as this simple countplot

<Axes: xlabel='Admin', ylabel='count'>

df.groupby("Admin").Result.value_counts(normalize=True).unstack()
# As you can see, it's a bit harder to get a document from a Central administration,
# and a bit easier to get them from municipalities

cent_fav_per_year = df.groupby(["Année", "Admin"]).Result.value_counts(normalize=True).unstack()[
    "Favourable"].unstack()["Central"] # We extract a Series with the percent of unfav decisions over the years for both the
# Central and Regional administrations
reg_fav_per_year = df.groupby(["Année", "Admin"]).Result.value_counts(normalize=True).unstack()[
    "Unfavourable"].unstack()["Regional"]
print(cent_fav_per_year.dropna())

from scipy import stats # a statistics module

stats.ttest_ind(cent_fav_per_year.dropna().values.tolist(), reg_fav_per_year.dropna().values.tolist(), equal_var=False)
# The t-test function takes only lists of numbers, so we drop the "N/A" values and convert series to lists
# Since the p-value is above 0.5, we cannot conclude that the samples are that different

Année
2010    0.584507
2011    0.635036
2012    0.575342
2013    0.618790
2014    0.601707
2015    0.587328
2016    0.578329
2017    0.527129
2018    0.608139
2019    0.562098
2020    0.619203
2021    0.600551
2022    0.711048
Name: Central, dtype: float64

TtestResult(statistic=9.479602675291718, pvalue=1.5052844858840268e-08, df=18.59372736360388)

ax = df.loc[df.Admin == "Central"].resample("2M").Result.value_counts(normalize=True).unstack()["Favourable"].plot()
# Next we look at the number of decisions per month, focusing on Favourable results.
# We pass it to an object ax that will represent our plot
ax.axvline("2022-04-23", color="red")  # We add a line to indicate the presidential elections' rough date
ax.axvline("2017-04-23", color="red")
ax.axvline("2012-04-23", color="red")

<ipython-input-17-6311b90ab6c7>:1: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  ax = df.loc[df.Admin == "Central"].resample("2M").Result.value_counts(normalize=True).unstack()["Favourable"].plot()

<matplotlib.lines.Line2D at 0x79443260dc10>

df.loc[df.Admin == "Department"].resample("2M").Result.value_counts(normalize=True).unstack()["Favourable"].plot()
# Now the same for departmental elections
plt.axvline("2021-06-29", color="red")
plt.axvline("2015-03-29", color="red")

<ipython-input-18-6944c5a3311a>:1: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  df.loc[df.Admin == "Department"].resample("2M").Result.value_counts(normalize=True).unstack()["Favourable"].plot()

<matplotlib.lines.Line2D at 0x7944322e7110>

df.loc[df.Admin == "Regional"].resample("2M").Result.value_counts(normalize=True).unstack()["Favourable"].plot()
# Now the same for regional elections
plt.axvline("2021-06-29", color="red")
plt.axvline("2015-12-10", color="red")

<ipython-input-19-b86f42cb3edd>:1: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  df.loc[df.Admin == "Regional"].resample("2M").Result.value_counts(normalize=True).unstack()["Favourable"].plot()

<matplotlib.lines.Line2D at 0x7944321ab510>

df["Séance"]

# Second analysis: the average delay to get a CADA decision

df["Séance"] = pd.to_datetime(df["Séance"], format="%d/%m/%Y")
df["Saisine"] = df.Avis.astype(str).str.extract("(\d\d? [a-z]+ \d{4})")
# We create a new column by looking into the text of the avis and extracting a date with a regex pattern

print(df.Saisine.value_counts())  # We check, and this is not ideal, many false dates have inserted themselves
df["Saisine"] = df.Avis.astype(str).str.extract("le (\d\d? [a-z]{3,} \d{4})")
# One better way is to check for dates starting by "le", since most avis refer to a courrier "envoyé le"
print("______________")
print(df.Saisine.value_counts()) # This is better

french_to_english_months = {
    "janvier": "January",
    "février": "February",
    "mars": "March",
    "avril": "April",
    "mai": "May",
    "juin": "June",
    "juillet": "July",
    "août": "August",
    "septembre": "September",
    "octobre": "October",
    "novembre": "November",
    "décembre": "December"
}
df["Saisine"] = df["Saisine"].replace(french_to_english_months, regex=True) # We will need the date to be in proper English, not French

Saisine
17 juillet 1978      1579
8 juillet 2017        570
39 et 2019            359
19 et 2020            337
28 mai 1982           263
                     ... 
8 avril 2011            1
13 septembre 1995       1
30 octobre 2009         1
14 mai 2012             1
22 mars 2022            1
Name: count, Length: 3631, dtype: int64
______________
Saisine
8 juillet 2017      569
27 juillet 2020     184
9 septembre 2021    172
12 juillet 2021     121
24 juillet 2017     112
                   ... 
4 novembre 2005       1
16 mars 2008          1
4 mai 2011            1
3 mars 1991           1
22 mars 2022          1
Name: count, Length: 2905, dtype: int64

df["DateS"] = pd.to_datetime(df["Saisine"], format="%d %B %Y", errors="coerce")
# Then we can convert the dates found to a datetime object; we "coerce" to ignore errors (wrong format, etc.)

df["Delta"] = df["Séance"] - df["DateS"]  # We obtain the delta between the two dates
df["Days"] = df.Delta.apply(lambda x: x.days)
# And then fetch the number of days from the datetime values in the delta column. Again, the function apply allows
# you to work on the data in the columns immediately. Since the Delta data is datetime, we can get the .days attribute

df.loc[(df.Days > 0) & (df.Days < 300)].resample("3M").Days.mean().plot()  # Finally, we plot the results of our dataframe,
# removing outliers (negative deltas and deltas over 300 days)

<ipython-input-19-8464b3c73d63>:9: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  df.loc[(df.Days > 0) & (df.Days < 300)].resample("3M").Days.mean().plot()  # Finally, we plot the results of our dataframe,

<Axes: xlabel='Séance'>

	Numéro de dossier	Année
count	5.298200e+04	52982.000000
mean	2.016655e+07	2016.546903
std	4.107314e+04	4.165896
min	1.984000e+07	1984.000000
25%	2.014457e+07	2014.000000
50%	2.017288e+07	2017.000000
75%	2.019465e+07	2020.000000
max	2.022194e+07	2022.000000

Année	1984	1985	1986	1987	1988	1989	1990	1991	1992	1993	...	2013	2014	2015	2016	2017	2018	2019	2020	2021	2022
Type
Avis	2.0	1.0	NaN	2.0	2.0	1.0	3.0	4.0	6.0	6.0	...	4618.0	4339.0	4833.0	4450.0	5573.0	3929.0	5010.0	6970.0	5935.0	1284.0
Conseil	NaN	NaN	1.0	NaN	NaN	1.0	2.0	2.0	5.0	8.0	...	159.0	138.0	212.0	262.0	232.0	312.0	281.0	155.0	164.0	91.0
Sanction	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	2.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Séance
Séance
2010-01-14	14/01/2010
2010-01-14	14/01/2010
2010-01-14	14/01/2010
2010-01-14	14/01/2010
2010-01-14	14/01/2010
...	...
2022-04-21	21/04/2022
2022-04-21	21/04/2022
2022-04-21	21/04/2022
2022-05-12	12/05/2022
2022-05-12	12/05/2022

Analysing a Dataset

Preparation¶

Datasets¶

Explore a dataset¶

Synthetise Data¶

Analyses¶