Preparation¶
from IPython.display import HTML, display

def set_css():  # Make long output lines wrap in the notebook display
    display(HTML('''
    <style>
        pre {
            white-space: pre-wrap;
        }
    </style>
    '''))

get_ipython().events.register('pre_run_cell', set_css)  # Re-apply the CSS before every cell run
import requests # Scraping module
from bs4 import BeautifulSoup # HTML reading module
import regex as re
Requests¶
For instance, let's take this HEC webpage. We can describe this type of webpage as "static", because everyone sending a request to the server for this page will see the same result, and you won't be able to change it. The fact that you can fold/unfold elements is irrelevant: it's just a trick, as the data is already there and nothing is fetched when you click "unfold" (if you check the page source, you'll see all the data).
The HEC page is static, so we can use requests. In the code below, we use this package to get the html source code of the HEC webpage, which we store in a variable webpage. We then pass that variable's html data to the module BeautifulSoup, which transforms the original html (a pure string) into a data type that represents the html, and thus comes with specific methods and attributes.
import requests # Scraping module
from bs4 import BeautifulSoup # HTML reading module
import regex as re
webpage = requests.get("https://www.hec.edu/fr/grande-ecole-masters/ms-et-msc/ms/llm-droit-et-management-international/programme")
# We fetch the webpage and pass it to an object
print(webpage.status_code) # The webpage object comes with distinct attributes, such as status_code,
# which tells you whether the connection was successful: 200 means success, 404 means not found
soup = BeautifulSoup(webpage.content) # We then read the html (which is put as a string in
# webpage.content) with BeautifulSoup, and pass it to an object we'll call "soup"
prix_ao = soup.find(title="Prix Juridique et Fiscal Allen & Overy") # Using that soup object,
# we look for an element whose title (the html attribute) matches the one we are looking for
content = prix_ao.parent.parent.text # Next, we can use this element to get the element
# we are actually interested in, which here is the text of the grandparent (.parent.parent)
print(content)
200 Prix Juridique et Fiscal Allen & Overy Le Prix Juridique et Fiscal a été créé par Allen & Overy et HEC Paris en 2004, à la suite de l’adhésion d’Allen & Overy à la Fondation HEC. Chaque année, Allen & Overy récompense trois travaux de recherche (Mastère Spécialisé/LLM Droit et Management International ou Majeure Stratégie Fiscale et Juridique Internationale) dans le cadre du Prix Juridique & Fiscal. Pour postuler, l’étudiant doit obtenir la note A à sa thèse professionnelle et avoir le soutien du Directeur scientifique et du Directeur exécutif du Mastère Spécialisé/LLM.
prix_ao.text
'\n Prix Juridique et Fiscal Allen & Overy\n \n'
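As you can see, the raw .text often comes with extra line breaks and spaces around it; a quick way to clean it is the string method .strip() (a small illustration, reusing the prix_ao element from above):
prix_ao.text.strip() # Removes the whitespace and line breaks surrounding the text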
Now, suppose you want to scrape the French constitution from the website of the Conseil constitutionnel. The steps are the same: you ask requests to fetch the page and store it in a variable. You then extract the html from that variable to create a soup.
A few more steps are however needed to get the Constitution itself, as opposed to the whole page storing the Constitution, and possibly without the html code surrounding it.
(Note that, quite often, you'll want to scrape several distinct pages of a website, something you can do with a loop. Sometimes you don't know in advance which pages you'll want to scrape, and there are third-party tools, known as crawlers, for this.)
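To illustrate the looping approach mentioned in the note above, here is a minimal sketch; the URLs are hypothetical placeholders, not pages used in this course:
# Hypothetical URLs, for illustration only; replace them with the pages you actually need
urls = ["https://example.com/page1", "https://example.com/page2"]
soups = []
for url in urls:
    page = requests.get(url)  # Fetch each page in turn
    soups.append(BeautifulSoup(page.content))  # Store each page's soup for later processing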
To extract just the Constitution, you'll need to manipulate the html and locate what you need. The BeautifulSoup module is very helpful for this, but so is your browser: what you want to do is:
- Go to the webpage in your browser;
- Inspect or try to find the page source (on Chrome, that would be Right-Click, `Inspect`);
- Locate the highest element holding the data you are looking for;
- Find a way to locate that element with BeautifulSoup; and
- Perform operations over that element (for instance, extract text).
Here, the full constitution is contained in a "div" element with a particular class name that you can just copy and paste from the inspecting tool.
webpage = requests.get("https://www.conseil-constitutionnel.fr/le-bloc-de-constitutionnalite/texte-integral-de-la-constitution-du-4-octobre-1958-en-vigueur")
# Same logic, we first get the page
soup = BeautifulSoup(webpage.content) # Then create a soup
el = soup.find("h2", id="preambule")
print(el)
<h2 data-summary="PRÉAMBULE" id="preambule">PRÉAMBULE</h2>
webpage = requests.get("https://www.conseil-constitutionnel.fr/le-bloc-de-constitutionnalite/texte-integral-de-la-constitution-du-4-octobre-1958-en-vigueur")
# Same logic, we first get the page
soup = BeautifulSoup(webpage.content) # Then create a soup
cons_div = soup.find("div", class_="field field--name-field-reference-paragraph field--type-entity-reference-revisions field--label-hidden field__items")
# We look for the main element containing the entire constitution. Note that, by contrast with all
# other attributes, "class" needs to have an underscore at the end ("class_");
# this is because "class" already means something in native python
print(len([x for x in cons_div.descendants])) # how many indirect children in cons_div
1407
This was the easiest part; often, however, you only need one part of a webpage, or want to add conditions to the BeautifulSoup search criteria. One way to do this is to remember that html, like xml, is a structured language: elements have children and parents, over which you can iterate. For instance, the following will print every title (article numbers, here) in the Constitution.
for child in cons_div.findChildren("h3"):  # Looking for titles; note that all 'find' methods
    # in BeautifulSoup work from the point of view of the element you use them on, so here it is
    # looking for all children of cons_div - not of the whole soup
    print(child.text)
ARTICLE PREMIER. ARTICLE 2. ARTICLE 3. ARTICLE 4. ARTICLE 5. ARTICLE 6. ARTICLE 7. ARTICLE 8. ARTICLE 9. ARTICLE 10. ARTICLE 11. ARTICLE 12. ARTICLE 13. ARTICLE 14. ARTICLE 15. ARTICLE 16. ARTICLE 17. ARTICLE 18. ARTICLE 19. ARTICLE 20. ARTICLE 21. ARTICLE 22. ARTICLE 23. ARTICLE 24. ARTICLE 25. ARTICLE 26. ARTICLE 27. ARTICLE 28. ARTICLE 29. ARTICLE 30. ARTICLE 31. ARTICLE 32. ARTICLE 33. ARTICLE 34. ARTICLE 34-1. ARTICLE 35. ARTICLE 36. ARTICLE 37. ARTICLE 37-1. ARTICLE 38. ARTICLE 39. ARTICLE 40. ARTICLE 41. ARTICLE 42. ARTICLE 43. ARTICLE 44. ARTICLE 45. ARTICLE 46. ARTICLE 47. ARTICLE 47-1. ARTICLE 47-2. ARTICLE 48. ARTICLE 49. ARTICLE 50. ARTICLE 50-1. ARTICLE 51. ARTICLE 51-1. ARTICLE 51-2. ARTICLE 52. ARTICLE 53. ARTICLE 53-1. ARTICLE 53-2. ARTICLE 54. ARTICLE 55. ARTICLE 56. ARTICLE 57. ARTICLE 58. ARTICLE 59. ARTICLE 60. ARTICLE 61. ARTICLE 61-1. ARTICLE 62. ARTICLE 63. ARTICLE 64. ARTICLE 65. ARTICLE 66. ARTICLE 66-1. ARTICLE 67. ARTICLE 68. ARTICLE 68-1. ARTICLE 68-2. ARTICLE 68-3. ARTICLE 69. ARTICLE 70. ARTICLE 71. ARTICLE 71-1. ARTICLE 72. ARTICLE 72-1. ARTICLE 72-2. ARTICLE 72-3. ARTICLE 72-4. ARTICLE 73. ARTICLE 74. ARTICLE 74-1. ARTICLE 75. ARTICLE 75-1. ARTICLE 76. ARTICLE 77. ARTICLE 87. ARTICLE 88. ARTICLE 88-1. ARTICLE 88-2. ARTICLE 88-3. ARTICLE 88-4. ARTICLE 88-5. ARTICLE 88-6. ARTICLE 88-7 ARTICLE 89.
To get to the text of each article, once again you need to study the structure of the webpage. Here you'll see that each title has siblings, which are <p> elements, and which enclose the relevant text (in an attribute .text). Here is some code to reconstruct the Constitution in a dictionary.
Note that if you want to search an element by the text it contains, you can use a command of the form element.find(tag_name, text=re.compile(your_regex_pattern)).
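For instance, here is a small sketch reusing the cons_div element from above (and assuming the heading contains plain text), which finds the title of Article 16 by the text it contains:
art_16 = cons_div.find("h3", text=re.compile(r"ARTICLE 16\."))  # Search by text with a regex
print(art_16.text)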
dic_constitution = {}
for child in cons_div.findChildren("h3"):  # We go over every article title
    text = ""  # We create an empty variable to fill with the text
    article_num = re.search(r"\d+(-\d+)?|PREMIER", child.text).group()  # We get the article
    # number with regex; notice that some numbers are of the pattern \d+-\d+ (e.g., Article 88-2),
    # so we provide for this; the first article is also an exception
    for sib in child.find_all_next(["h3", "p"]):  # We iterate over the next elements
        if sib.name == "h3":  # We check if we have reached the next article, in which case we
            # break the loop
            break
        else:  # If we have not reached the next article, we add the text of the element 'p'
            # to our variable, separated by a line-break
            text += "\n" + sib.text.strip()  # Strip because online text often has empty strings
            # at the end and beginning of text
    dic_constitution[article_num] = text.strip()  # Once the loop over the text elements is over,
    # we put the result in our dictionary
for x in range(2, 5):
    print("L'article ", str(x), "de la Constitution est: \n", dic_constitution[str(x)])
L'article 2 de la Constitution est: La langue de la République est le français. L'emblème national est le drapeau tricolore, bleu, blanc, rouge. L'hymne national est « La Marseillaise ». La devise de la République est « Liberté, Égalité, Fraternité ». Son principe est : gouvernement du peuple, par le peuple et pour le peuple. L'article 3 de la Constitution est: La souveraineté nationale appartient au peuple qui l'exerce par ses représentants et par la voie du référendum. Aucune section du peuple ni aucun individu ne peut s'en attribuer l'exercice. Le suffrage peut être direct ou indirect dans les conditions prévues par la Constitution. Il est toujours universel, égal et secret. Sont électeurs, dans les conditions déterminées par la loi, tous les nationaux français majeurs des deux sexes, jouissant de leurs droits civils et politiques. L'article 4 de la Constitution est: Les partis et groupements politiques concourent à l'expression du suffrage. Ils se forment et exercent leur activité librement. Ils doivent respecter les principes de la souveraineté nationale et de la démocratie. Ils contribuent à la mise en œuvre du principe énoncé au second alinéa de l'article 1er dans les conditions déterminées par la loi. La loi garantit les expressions pluralistes des opinions et la participation équitable des partis et groupements politiques à la vie démocratique de la Nation.
dic_constitution["17"]
Exercise 1¶
The Constitution is divided into parts and subsections. The subsections, if you look at the page source, are found in elements whose tag is "h2". Let's write an algorithm that returns the section with the most articles (subarticles of the form "56-X" count as one). There are several ways to go about it.
# Your Code here
# Get the website data + soup
webpage = requests.get("https://www.conseil-constitutionnel.fr/le-bloc-de-constitutionnalite/texte-integral-de-la-constitution-du-4-octobre-1958-en-vigueur")
# Same logic, we first get the page
soup = BeautifulSoup(webpage.content) # Then create a soup
# Get the topmost element with the constitution
cons_div = soup.find("div", class_="field field--name-field-reference-paragraph field--type-entity-reference-revisions field--label-hidden field__items")
longest_section = 0
# Locate subsections
for child in cons_div.findChildren("h2"):  # We go over every section title
    ll = []  # A list to collect this section's article titles
    for sib in child.find_all_next(["h2", "h3"]):  # We iterate over the next titles
        if sib.name == "h2":  # We check if we have reached the next section, in which case we
            # break the loop
            break
        else:  # Otherwise this is an article title ("h3") belonging to the current section,
            # so we add it to the list
            ll.append(sib)
    if len(ll) > longest_section:  # If this section has more articles than any section so far,
        longest_section = len(ll)  # we record the new maximum
        print(child)  # and print the section title
<h2 data-summary="PRÉAMBULE" id="preambule">PRÉAMBULE</h2> <h2 data-summary="Titre premier - DE LA SOUVERAINETÉ" id="titre_premier_de_la_souverainete">Titre premier - DE LA SOUVERAINETÉ</h2> <h2 data-summary="Titre II - LE PRÉSIDENT DE LA RÉPUBLIQUE" id="titre_ii_le_president_de_la_republique">Titre II - LE PRÉSIDENT DE LA RÉPUBLIQUE</h2> <h2 data-summary="Titre V - DES RAPPORTS ENTRE LE PARLEMENT ET LE GOUVERNEMENT" id="titre_v_des_rapports_entre_le_parlement_et_le_gouvernement">Titre V - DES RAPPORTS ENTRE LE PARLEMENT ET LE GOUVERNEMENT</h2>
longest_section
25
There is a second way to go about it, which leverages the ToC on the left-hand side of the webpage. Use it to obtain the same answer.
# Your Code here
Exercise 2¶
This page contains a list of the 20 most recent decisions from the UK courts matching some keywords.
- Extract all URLs to the judgments viewed as "Without Highlighting", and put them in a list urls
Then visit each url in turn, and:
- Extract the citation number(s)
- Extract the name of the judge
- Extract the date of hearing
- Extract the number of paragraphs
Put all that data in a dataframe (use the list-of-lists method: each judgment's data in one list, each list in a bigger list which you'll pass to pd.DataFrame(), as in the sketch below)
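As a reminder of the list-of-lists pattern, here is a minimal sketch with made-up values (not the exercise's answer):
import pandas as pd
rows = []  # One list per judgment
rows.append(["[2020] EWHC 1 (Ch)", "Smith J", "1 January 2020", 120])  # Hypothetical values
rows.append(["[2020] EWCA Civ 2", "Jones LJ", "2 February 2020", 85])  # Hypothetical values
df = pd.DataFrame(rows, columns=["Citation", "Judge", "Date of hearing", "Paragraphs"])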
Selenium¶
From now on, you'll need to run the code on your own laptop, since Selenium needs to take control of a browser
By contrast, dynamic websites are much more complex. For scraping purposes, the main issue is that they fetch data that is not available or visible in the .html code, or at least not immediately. If you check the page source, you'll typically see the framework in which the page's content is placed, but not the content itself. You can also think of it in terms of where the data is processed: fully by the client (i.e., you) for static webpages, and in part by the server for dynamic pages.
Another way to put it is that the data on a dynamic site will change without any related change to the URL: there is no deterministic relation between the url and the resulting data. This means that a module like requests, which simply takes the direct html output of a request to the server, cannot fetch data that requires further action on your part.
(The distinction between static and dynamic is often blurry, as we'll see, but to the extent possible it's easier to work with static websites, so I'd advise seeking shortcuts. For instance, querying a database with a search term can be dynamic (the resulting webpage depends on the query), but in some circumstances you can make it static by including the request in the URL and fiddling with it (example here: see how you can manipulate the URL to get the required results, which are then static, or mostly so).)
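To make that shortcut concrete, here is a minimal sketch; the endpoint and parameter names are made up for illustration, but the pattern is the same: the query is encoded directly in the URL, so requests can fetch the results like any static page.
params = {"q": "constitution", "page": 1}  # Hypothetical search parameters
resp = requests.get("https://example.com/search", params=params)  # requests encodes them into the URL
print(resp.url)  # Something like https://example.com/search?q=constitution&page=1
print(resp.status_code)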
For instance, if you want to get the decisions of the Conseil d'Etat from Ariane, you need to click on at least two buttons. Once you do this, the URL does not budge - this is not a new page loading, but a truly dynamic webpage reacting to your input by producing (server-side) a new output, incorporated in the original website's framework. If we try to get it with requests, we can't get the table element from the webpage.
webpage = requests.get("https://www.conseil-etat.fr/arianeweb/#/recherche")
# We try to get the conseil d'Etat research page
soup = BeautifulSoup(webpage.content) # We create a soup element on that basis
table = soup.find("table") # We look for the table with documents to download
print(table) # Uh oh
None
Instead we will use Selenium, a package made for this: Selenium takes control of a browser, and allows your script to do everything you could do by hand.
The main steps are however the same as for scraping with requests: you'll need to know how to get to your results, tell the robot/webdriver as much, and then collect the page source and parse it with BeautifulSoup.
One difference is that instead of only looking for elements in the HTML (put into a soup) and collecting the data, you now need to interact with these elements - for instance, by clicking. The way to do this is to look for the element with the help of the browser itself (the "driver"), pass it to a variable, and act upon that variable.
This might require a lot of trial and error, especially when you are required to search for something by xPath: what you see, even in the robot browser, might be different from what Selenium sees (a good way to make sure the element you are looking for is really there is simply to print driver.page_source in the console).
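For instance, once a driver is running (as in the cells below), a quick check looks like this:
print(driver.page_source[:1000])  # The first 1,000 characters of the html as Selenium currently sees it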
import regex as re
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
# (remember to use pip install X if you don't have module X)
from selenium.webdriver.common.keys import Keys as KeysBrowser
import pandas as pd
import time
driver = webdriver.Chrome()
# This launches the browser
driver.get("https://www.conseil-etat.fr/arianeweb/#/recherche") # We go to the CE's database
# landing page
# Looking at the .html in a browser > Inspect, we see the relevant checkbox has an
# attribute 'ng-change' with value "sources.selectSource('AW_DCE')"
att = "sources.selectSource('AW_DCE')"
# Because of the way that quotes and double-quotes interact,
# we need to put the value of that attribute in a variable
el = driver.find_element(By.XPATH, r'.//*[@ng-change="' + att + '"]')
# And then look for the element with that attribute like this - note that just before closing
# the first string, we open a double-quote, which is closed at the beginning of the last string
el = driver.find_element(By.XPATH, r'.//*[contains(text(), "Décisions du Conseil")]')
# An alternative is to realise that the relevant checkbox has this unique text,
# so we use xPath to find the element by its text
el.click() # We click on the element we are interested in
button = driver.find_element(By.XPATH, ".//button[@class='btn btn-primary']")
# Next we need to click on "Rechercher"; Inspect tool tells us that this is how to find it
button.click() # and then on the button, which discloses the table
Now let's say we want to collect hundreds of files from this database. If you did this manually, you would need to click on every judgment one by one to save it on your computer.
Instead, we can automate it with Selenium.
soup = BeautifulSoup(driver.page_source) # We recreate a soup, now that the page source
# has changed, since new content has been dynamically added
table = soup.find_all("table")[-1] # Collect tables from the page; there are two of them
# in the page source (and find_all returns a list), and we are interested in the last one.
df = pd.read_html(str(table))[-1] # To make things easier, we convert the table in a panda
# dataframe, with each row storing data about one decision. Note the 'str' command:
# it's because the original table is a BeautifulSoup object, and not a string.
# This method returns a list of dataframes, so make sure to select the last one
df.head(10) # Always a good idea to see what the dataframe looks like
for index, row in df.iterrows():  # For each row, we'll make the browser click on the element
    # and collect the judgment
    num = re.search(r"\d+", row["Numéro d'affaire"]).group()  # Taking only the number, because
    # the (...) messes up xPath
    row_el = driver.find_element(By.XPATH, ".//td[contains(text(), '" + num + "')]")
    # With that num, we look for the relevant element in the browser
    row_el.click()  # Load the page with the judgment
    time.sleep(1)  # Giving the page time (1s) to load before changing focus, with function
    # time.sleep (imported above)
    driver.switch_to.window(driver.window_handles[-1])
    # Switch the driver's focus to the window you just opened, with the method "switch_to"
    # and, as argument, the relevant window from the list of window_handles
    # (the latest loaded window will be -1)
    save_el = driver.find_element(By.CSS_SELECTOR, "button[title='enregistre le document']")
    # Find the download button, using a CSS selector here so as to rely on the unique 'title'
    save_el.click()  # Download the judgment, in html format; it will end up in
    # your normal Downloads folder
    driver.switch_to.window(driver.window_handles[0])
    # Important to return to the main window, or the next search for row_el won't work
Finally, the database only displays 50 decisions at a time - to get more, we would need to click on the small arrows at the bottom - something that Selenium can do for us. Let's say we want to reproduce the table on that webpage, but for thousands of decisions.
LL = []  # We create a list that will take our dataframes, page by page
for x in range(2500):
    soup = BeautifulSoup(driver.page_source)  # We create a soup, page by page
    table = soup.find_all("table")[-1]  # Find the table in that soup
    LL.append(pd.read_html(str(table))[-1])  # Put the resulting dataframe in our list
    el = driver.find_elements(By.XPATH, ".//a[@ng-click='selectPage(page + 1)']")[-1]
    # Find the element with the arrow to change page
    el.click()  # Click on it
df = pd.concat(LL)  # Create the large dataframe based on the list of smaller dfs
df.index = pd.to_datetime(df["Date de lecture"])  # Put the index as a date
ax = df.resample("6M")["Code de publication"].value_counts().unstack().plot()
# Plot the number of cases per batch of six months