Preparation¶
!pip install gdown==v4.6.3
!pip install eli5
!pip install --upgrade spacy
!pip install spacytextblob
import gdown
import os
# Google Drive URLs for the .csv files
gdrive_cc = "https://drive.google.com/uc?id=1QGtE_ZFFEmzNxtvI5YlrqaOEflnaVvCx"
gdrive_cass = "https://drive.google.com/uc?id=10Z0PTGm8QoHMbCEp4YEBCYteaRm_3Gs_"
# Download the .csv files into the working directory
gdown.download(gdrive_cc, "CC.csv", quiet=False)
gdown.download(gdrive_cass, "CassTexts.csv", quiet=False)
!python -m spacy download fr_core_news_md
!python -m spacy download en_core_web_lg
!pip install pandas==2.2.2
!pip install numpy==1.23.5
Natural Language Processing¶
1. Natural language processing (NLP) refers to a set of methods and tools used to work with text data.
The field rests on the fact that language, as you might have noticed, is not random: we use words with meaningful frequencies, and in accordance with a syntax that makes them obey a number of rules.
For instance, Zipf's Law appears to hold, so far, for every natural language; once you know this, you can identify words that depart from the expected distribution, for instance to detect topics, or even forgeries (much as tax authorities, in theory, use Benford's Law to track fraud).
Now, one of the field's basic ideas is that words and concepts can be reduced to vectors, which then allows them to be compared with each other, and similarities between them to be found. (In this respect, I recommend this New Yorker article, which explains the bigger picture: the same intuition powers attempts to read the brains of paralysed individuals, for instance.)
import pandas as pd
import regex as re
import spacy
from collections import Counter
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
nlp = spacy.load("fr_core_news_md", disable=["ner", "textcat"]) # We load the spaCy model into an nlp object; disabling components we don't need ("ner", "textcat") speeds things up. Models need to be downloaded from the terminal first (see above)
df = pd.read_csv("CC.csv", header="infer") # Working with a dataset of versions of the Code civil since 1804
text = "\n".join(df.loc[df.version == 1]["Text"].values.tolist())
doc = nlp(text)
words_doc = [tok.text for tok in doc if tok.is_alpha] # Alphabetic tokens only (is_alpha already excludes punctuation and digits)
CW = Counter(words_doc)
plt.figure(figsize=(10,10)) #to increase the plot resolution
plt.ylabel("Frequency")
plt.xlabel("Words")
plt.xticks(rotation=90) #to rotate x-axis values
for word, freq in CW.most_common(30):
plt.bar(word, freq)
plt.show()
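As a quick check of Zipf's Law itself (frequency roughly proportional to 1/rank), we can plot rank against frequency on log-log axes, where the relationship should come out roughly as a straight line. A minimal sketch, reusing the counter we just built:
ranks = np.arange(1, len(CW) + 1)
freqs = sorted(CW.values(), reverse=True)  # frequencies from most to least common
plt.figure(figsize=(6,6))
plt.loglog(ranks, freqs)
plt.xlabel("Rank (log scale)")
plt.ylabel("Frequency (log scale)")
plt.title("Zipf's Law in the Code civil")
plt.show()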
# Now Benford's law
numbers = re.findall(r"[1-9](?=\d|-|$)", "".join(df.Art.values.tolist())) # Leading non-zero digits of article numbers; the raw string avoids invalid-escape warnings
CC = Counter(numbers)
plt.figure(figsize=(10,10)) #to increase the plot resolution
plt.ylabel("Frequency")
plt.xlabel("Number")
plt.xticks(rotation=90) #to rotate x-axis values
for key, value in CC.most_common(9):
plt.bar(key, value)
plt.show()
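For reference, Benford's Law predicts that the leading digit d appears with probability log10(1 + 1/d). A small sketch to compare those theoretical shares with our observed counts:
import math
total = sum(CC.values())
for d in range(1, 10):
    expected = math.log10(1 + 1/d)  # Benford's predicted share for digit d
    observed = CC[str(d)] / total   # share actually observed in the article numbers
    print(d, round(expected, 3), round(observed, 3))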
2. Don't worry: you don't need to go deep into the mathematics of it all to use NLP. As often with Python, someone has already done most of the work for you.
That includes preparing datasets and tools that cut to the chase when analysing text. As mentioned, language follows rules, but those rules leave quite a lot of variety, and you wouldn't go very far if your script treated two conjugated forms of the same verb as totally different words. Words are also polysemic, and you need Python to understand that a "fly" (the insect) is different from the verb "to fly". And how do you recognise words that are not in a standard dictionary, or even names?
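To see how a model handles this in practice, here is a small sketch with an invented sentence: the French model should tag the verb "ferme" (from "fermer") differently from the noun "ferme" (the farm).
for tok in nlp("Il ferme la porte de la ferme."):
    print(tok.text, tok.pos_)  # same surface form, different part-of-speech tags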
3. All this is why tools in this space typically attempt to provide the following functions:
- Tokenization: we want to reduce words to tokens with a fixed meaning; this often involves
- Part-of-speech (POS) tagging, i.e., identifying subjects, verbs, etc.;
- Chunking, to identify parts of a text that necessarily go together, like verbs with split auxiliaries, or names like "Professeur Barbotin";
- Named-entity recognition, to distinguish names from the rest of a text, but also to assign a type to these names (organisation, proper name, brand, etc.);
- Stemming, the process of finding the "root" of a word that gets inflected; and
- Lemmatisation, the process of mapping the inflected forms of a word to a single dictionary form, or lemma (this is harder and requires pre-existing data) - both illustrated in the sketch below; and
- Stop-word removal: stop words are the "the", "and", punctuation, etc., that in most cases are not meaningful for the analysis (though they can be for some, for instance stylometry).
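To make the stemming/lemmatisation distinction concrete, here is a small sketch contrasting nltk's French stemmer (nltk was installed above as a dependency) with spacy's lemmatizer; the example words are purely illustrative:
from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
print(stemmer.stem("obligations"))  # a truncated root, e.g. "oblig", which is not necessarily a word
print([tok.lemma_ for tok in nlp("les obligations contractuelles")])  # dictionary forms, e.g. ['le', 'obligation', 'contractuel']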
4. In what follows, we'll use the module spacy, which does all that, including for French. (An older, yet good alternative is nltk, which has French-specific modules as well.) Go here to check how to import spacy and the language model.
Spacy processes texts according to a pipeline: first finding the tokens, then their roots, then their place in the sentence, etc.
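You can inspect those stages directly (the exact component names depend on the model version, and disabled components are excluded):
print(nlp.pipe_names)  # e.g. ['tok2vec', 'morphologizer', 'parser', 'lemmatizer', 'attribute_ruler']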
We'll do it over a dataset of articles from the Code civil to check who were the main subjects of the law at the time, and who they are now.
5. But first, a few words as to spacy: you first need to load the language model into an object (here called nlp), and from that object you use functions related to a text. The main function transforms some text into a token or a list of tokens.
The reason why we use pre-trained language models is that they are already "smart": they know what words refer to in general (though maybe not in the context of a subdomain, such as law). Usually, this is done by ingesting loads of text and identifying words that go together, to get a sense of the distance between two words: if two distinct words are often used interchangeably in a given context, they are probably synonyms, for instance. Doing this, you obtain vectors that give you an approximation of a word's position in the vocabulary space. (This is called word embedding, and you can learn more about it here, or here.)
But here is the most common example: using the word "roi", we can try to identify the closest approximation of "roi" once we subtract "homme" and add "femme" (this works only with the larger French model).
# And now in French
roi = nlp("roi")
def close_words_from_vector(vec): # A function that returns the 10 most similar words compared to "vec"
ms = nlp.vocab.vectors.most_similar(np.array([vec]), n=10)
return [nlp.vocab.strings[w] for w in ms[0][0]]
print(close_words_from_vector(roi.vector))
analogie = nlp("roi").vector - nlp("homme").vector + nlp("femme").vector # We find the vector that corresponds to roi, minus homme, plus femme
print(close_words_from_vector(analogie)) # Works only with large model
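In the same vein, vectors let you compute how similar two words or texts are; a quick sketch (the word pairs are illustrative, and the scores depend on the model):
print(nlp("juge").similarity(nlp("tribunal")))  # semantically close, so relatively high
print(nlp("juge").similarity(nlp("banane")))    # unrelated, so lower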
6. Vectors (and similarities) are fun, but mostly you'll be interested in spacy's power to detect what's a subject, a stop word, etc. A few attributes found in spacy tokens are essential here:
- pos_, which is the part of speech (think verbs, nouns, etc.);
- dep_, which is the relation of a given word to the structure of the sentence, distinguishing a subject from an object, for instance; and
- morph, which gives you a number of additional data points about a token, such as gender, number, and conjugation.
Besides, spacy has a number of booleans inserted into tokens, which you can query with attributes such as .is_punct (True if punctuation, False otherwise), .is_stop (True if stop word, False otherwise), etc.
article_1382 = "Tout fait quelconque de l'homme, qui cause à autrui un dommage, oblige celui par la faute duquel il est arrivé à le réparer."
article_1383 = "On est responsable non seulement du dommage que l'on cause par son propre fait, mais encore de celui qui est causé par le fait des personnes dont on doit répondre, ou des choses que l'on a sous sa garde. Toutefois, celui qui détient, à un titre quelconque, tout ou partie de l'immeuble ou des biens mobiliers dans lesquels un incendie a pris naissance ne sera responsable, vis-à-vis des tiers, des dommages causés par cet incendie que s'il est prouvé qu'il doit être attribué à sa faute ou à la faute des personnes dont il est responsable. Cette disposition ne s'applique pas aux rapports entre propriétaires et locataires, qui demeurent régis par les articles 1733 et 1734 du code civil. Le père et la mère, en tant qu'ils exercent l'autorité parentale, sont solidairement responsables du dommage causé par leurs enfants mineurs habitant avec eux. Les maîtres et les commettants, du dommage causé par leurs domestiques et préposés dans les fonctions auxquelles ils les ont employés ; Les instituteurs et les artisans, du dommage causé par leurs élèves et apprentis pendant le temps qu'ils sont sous leur surveillance. La responsabilité ci-dessus a lieu, à moins que les père et mère et les artisans ne prouvent qu'ils n'ont pu empêcher le fait qui donne lieu à cette responsabilité.En ce qui concerne les instituteurs, les fautes, imprudences ou négligences invoquées contre eux comme ayant causé le fait dommageable, devront être prouvées, conformément au droit commun, par le demandeur, à l'instance."
doc = nlp(article_1382) # We transform the text in a spacy object
for tok in doc: # Iterating through each token in the text
    if not tok.is_stop and not tok.is_punct: # We check that the token is neither a stop word nor punctuation
print(tok, tok.lemma_, tok.dep_, tok.pos_) # In that case, we print the token text, its lemma, its place in text, etc
else:
print(tok.text) # Otherwise we just print the text
doc2 = nlp(article_1383) # This article has several sentences, so we can use Spacy to split them; each sentence becomes its own sequence of tokens, but keeps attributes from the full text (like where it starts)
for sent in doc2.sents:
print(sent, sent.start) # We print each sentence
verbs = []
for tok in sent: # For each sentence, we go token by token
        if tok.pos_ == "VERB": # We check whether that token is a verb, and then store it with its lemma
verbs.append(tok.text + "=" + tok.lemma_)
print("VERBS: ", verbs)
nouns = []
    for chunk in sent.noun_chunks: # We collect the noun chunks (groupes nominaux)
nouns.append(chunk)
print("NAME GROUPS: ", nouns)
Tout fait quelconque de l' homme homme nmod NOUN , qui cause cause acl:relcl NOUN à autrui un dommage dommage obl:arg NOUN , oblige obliger advcl VERB celui par la faute faute obl:agent NOUN duquel il est arrivé arriver ROOT VERB à le réparer réparer xcomp VERB .
On est responsable non seulement du dommage que l'on cause par son propre fait, mais encore de celui qui est causé par le fait des personnes dont on doit répondre, ou des choses que l'on a sous sa garde. 0
VERBS: ['cause=cause', 'causé=causer', 'doit=devoir', 'répondre=répondre', 'a=avoir']
NAME GROUPS: [On, dommage, l', on, son propre fait, encore de celui, qui, le fait, des personnes, on, , ou des choses, que, l', on, sa garde]
Toutefois, celui qui détient, à un titre quelconque, tout ou partie de l'immeuble ou des biens mobiliers dans lesquels un incendie a pris naissance ne sera responsable, vis-à-vis des tiers, des dommages causés par cet incendie que s'il est prouvé qu'il doit être attribué à sa faute ou à la faute des personnes dont il est responsable. 44
VERBS: ['détient=détenir', 'pris=prendre', 'responsable=responsable', 'causés=causer', 'prouvé=prouver', 'doit=devoir', 'attribué=attribuer', 'responsable=responsable']
NAME GROUPS: [qui, un titre quelconque, , tout ou partie de l'immeuble, des biens, lesquels, un incendie, naissance, cet incendie, sa faute, à la faute, personnes, il]
Cette disposition ne s'applique pas aux rapports entre propriétaires et locataires, qui demeurent régis par les articles 1733 et 1734 du code civil. 110
VERBS: ['applique=applique', 'demeurent=demeurer', 'régis=régir']
NAME GROUPS: [Cette disposition, rapports, propriétaires, locataires, qui, les articles, code, civil]
Le père et la mère, en tant qu'ils exercent l'autorité parentale, sont solidairement responsables du dommage causé par leurs enfants mineurs habitant avec eux. 136
VERBS: ['exercent=exercer', 'causé=causer', 'habitant=habiter']
NAME GROUPS: [Le père, la mère, ils, l'autorité parentale, dommage, leurs enfants mineurs, eux]
Les maîtres et les commettants, du dommage causé par leurs domestiques et préposés dans les fonctions auxquelles ils les ont employés ; 165
VERBS: ['causé=causer', 'préposés=préposer']
NAME GROUPS: [Les maîtres, les commettants, dommage, leurs domestiques, les fonctions, ils, les]
Les instituteurs et les artisans, du dommage causé par leurs élèves et apprentis pendant le temps qu'ils sont sous leur surveillance. 188
VERBS: ['causé=causer', 'apprentis=apprenti']
NAME GROUPS: [Les instituteurs, les artisans, dommage, leurs élèves, le temps, ils]
La responsabilité ci-dessus a lieu, à moins que les père et mère et les artisans ne prouvent qu'ils n'ont pu empêcher le fait qui donne lieu à cette responsabilité. 212
VERBS: ['a=avoir', 'prouvent=prouver', 'pu=pouvoir', 'empêcher=empêcher', 'donne=donne']
NAME GROUPS: [La responsabilité, ci, -, dessus, lieu, les père, mère, les artisans, ils, le fait, qui, lieu, cette responsabilité]
En ce qui concerne les instituteurs, les fautes, imprudences ou négligences invoquées contre eux comme ayant causé le fait dommageable, devront être prouvées, conformément au droit commun, par le demandeur, à l'instance. 247
VERBS: ['concerne=concerner', 'invoquées=invoquer', 'causé=causer', 'devront=devoir', 'prouvées=prouver']
NAME GROUPS: [ce qui concerne les instituteurs, les fautes, , imprudences, négligences, eux, le fait dommageable, droit commun, le demandeur, l'instance]
7. Another thing we can do is to curate the text, then look at the most common words, depending on the year of the version of the code civil.
Exercise Based on what you just learned, find the 15 most common subjects in the Code civil for each year between 1980 and 2020.
df.index = pd.to_datetime(df.Date, format="mixed") # Change the index to the date of the article, so as to do time series
df = df.sort_index()
def spacy_process(text): # We first prepare the text by using spacy's token elements to remove stop words and punctuation
doc = nlp(text) # We transform the text with spacy
filtered_sentence = [] # Empty list for the tokens we'll want to keep
punctuations = ["?",":","!",".",",",";","-", "(",")"] # A list of punctuation
banned_words = ["ARTICLE", "CODE"] # A list of words we are not interested in, because they are very frequent
for token in doc:
        if not token.is_stop and token.lemma_ not in punctuations and token.text.upper() not in banned_words: # We keep tokens only if they are not stop words, punctuation, or banned words
filtered_sentence.append(token.lemma_)
return " ".join(filtered_sentence)
df["CText"] = df.Text.map(lambda x: spacy_process(x)) # We apply it to our dataframe, so as to have a new column with cleaned text, rid of stop words and punctuations
def get_code_by_date(db, end): # Sort by version, ascending, so that for each article the dict below keeps the highest (i.e., latest) version published before the cut-off date
data = []
tempdict = db[:end].set_index("Art").sort_values(by="version", ascending=True).to_dict()["version"]
for index, row in db[:end].iterrows():
if row.Art in tempdict and row["version"] == tempdict[row["Art"]]:
data.append(row.values)
return pd.DataFrame(data, columns=db.columns)
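As a quick sanity check of the function (slicing with a date string works because we set a sorted DatetimeIndex above):
cc_1950 = get_code_by_date(df, "1950")  # the Code as it stood at the end of 1950
print(len(cc_1950), "articles in force")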
fig, ax = plt.subplots(2,2, figsize=(15,15)) # We initialise a set of subplots
fig.suptitle('Most common words for given years of Code civil') # Which we entitle
for e, year in enumerate(["1805", "1950", "2000", "2021"]): # A selection of years
print(year)
ax.ravel()[e].set_title("Most Common - " + year)
    aggregate_counter = Counter() # We initialise a counter object (which counts items and lets you retrieve the most common ones)
db = get_code_by_date(df, year) # We get our limited code civil updated by date
for index, row in db.iterrows(): # We iterate over every article
c = Counter(row['CText'].split()) # Split into words, and count them
        aggregate_counter += c # And add them to the main counter
common_words = []
common_words_counts = []
for el in aggregate_counter.most_common(25): # For the 25 most common items in our counter, we print the name of the item (at index 0), and the count (at index 1)
#print(el[0], el[1])
common_words.append(el[0]) # We also add this data to lists that will be used to create a plot
common_words_counts.append(el[1])
    # Choose a colorful palette
    palette = sns.color_palette("Spectral", len(common_words)) # This palette provides a good range of colors
    bar_plt = sns.barplot(ax=ax.ravel()[e], x=common_words, y=common_words_counts, hue=common_words, palette=palette, legend=False) # A bar plot from two inputs: the words and their counts; assigning hue (with legend=False) avoids seaborn's palette deprecation warning
for item in bar_plt.get_xticklabels():
item.set_rotation(45)
if item._text in ["héritier", "créancier", "propriétaire"] :
item.set_fontweight("bold")
plt.show()
1805 1950 2000 2021
# And now for something more complicated...
df["ID"] = df["Art"] + "_" + df["version"].astype(str)
def get_sents(db): # First we cut articles into sentences. Note that this could also be done with NLP, but I don't find the performance better in this case
data = []
ii = 0
prev = ""
for index, art in db.iterrows():
sents = re.split(r"\.\n|\. (?=[A-Z])", art.Text)
for sent in sents:
ii = ii + 1 if art["ID"] == prev else 0
l = [art["Art"], art["ID"], art["ID"] + "_" + str(ii), art["Date"], sent + ".", len(sent)]
data.append(l)
prev = art["ID"]
return pd.DataFrame(data, columns=["Art", "ID", "Al", "origin_date", "Text", "lenText"])
def get_subj(doc):
pastsub = ""
    real_subj = "NoSubj" # We initialise the subject variable; it will stay "NoSubj" if no subject is found
for subj in [x for x in doc if "nsubj" in x.dep_]: # We iterate through all possible subjects in the sentence
real_subj = subj
        if subj.pos_ == "PRON": # If this is a pronoun, however, we take the previous subject instead
real_subj = pastsub
else:
pastsub = subj
if type(real_subj) == spacy.tokens.token.Token: # Added to provide full noun chunk
chunks = [x for x in doc.noun_chunks if real_subj in x]
full_subj = chunks[0] if len(chunks) == 1 else real_subj.lemma_
else:
full_subj = ''
return [real_subj, full_subj]
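A quick sanity check on the article we parsed earlier (the exact tokens returned depend on the model version):
print(get_subj(nlp(article_1382)))  # [subject token, its full noun chunk when one is found]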
# Same code as above, but focusing on subjects per sentence thanks to the functions above
fig, ax = plt.subplots(2,2, figsize=(15,15))
fig.suptitle('Most common subjects for given years of Code civil')
for e, year in enumerate(["1805", "1950", "2000", "2021"]): # A selection of years
print(year)
ax.ravel()[e].set_title("Most Common Subject - " + year)
    aggregate_counter = Counter() # We initialise a counter object (which counts items and lets you retrieve the most common ones)
db = get_code_by_date(df, year) # We get our limited code civil updated by date
db = get_sents(db)
db["Subj"] = ""
for index, row in db.iterrows():
doc = nlp(row["Text"])
db.at[index, "Subj"] = get_subj(doc)[0]
    subj_counts = db.Subj.astype(str).value_counts()[1:20] # Index 0 is typically "NoSubj", so we skip it
    bar_plt = sns.barplot(ax=ax.ravel()[e], x=subj_counts.index, y=subj_counts.values) # A bar plot from two inputs: the subjects and their counts
for item in bar_plt.get_xticklabels():
item.set_rotation(45)
1805 1950 2000 2021
(Complete aside, but I want to mention it at some point and keep track of this link: names are terrible data.)
Topic Modelling¶
This, in short, is a subdomain of Natural Language Processing (NLP) that uses statistical methods to find emergent themes across a corpus of texts.
The crux of these methods lies in their ability to represent each document as a mixture of topics and each topic as a combination of words, all the while leaning on the edifice of probability and linear algebra. For instance, in an array of legal texts, topics might revolve around constitutional law, property rights, or jurisprudence; in a corpus of news articles, politics, sports, and economy might be dominant themes.
Topic modelling is widely utilized across numerous fields, serving both practical and conceptual goals. In academia, it has been employed to discover hidden themes in historical archives or scientific publications. In the commercial realm, it powers recommendation engines, sentiment analysis tools, and content categorization systems. In legal circles, it could assist in organizing case law or legislative texts into coherent themes for easier navigation and understanding.
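Before turning to BERTopic below, here is a minimal sketch of the "documents as mixtures of topics, topics as combinations of words" idea, using scikit-learn's LDA on a few invented snippets (everything in this example is illustrative):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
docs = ["le juge condamne la partie aux dépens",
        "le tribunal rejette le pourvoi du demandeur",
        "le propriétaire loue l'immeuble au locataire",
        "le locataire paie le loyer au propriétaire"]
vec = CountVectorizer()
X = vec.fit_transform(docs)  # document-term count matrix
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)
words = vec.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    print("Topic", i, ":", [words[j] for j in topic.argsort()[-3:]])  # top 3 words per topic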
!pip install bertopic # Takes some time to load
from bertopic import BERTopic
import plotly.io as pio
df = pd.read_csv("CassTexts.csv", header="infer", encoding="latin1")
df["CText"] = df.Text.apply(spacy_process)
nlp = spacy.load("fr_core_news_md", disable=["ner", "textcat"])
topic_model = BERTopic(embedding_model=nlp, n_gram_range=(1, 3), min_topic_size=3, nr_topics="auto") # We set up the topic model
topics, probs = topic_model.fit_transform(df.CText.values.tolist())
topic_model.get_topic_info() # We visualise the top terms for each topic
df["Topic"] = topics
# df.to_clipboard(index=False, encoding="utf8")
fig = topic_model.visualize_topics()
pio.show(fig)
Iran-US Claims Tribunal¶
Now for something a bit different, let's apply these methods to a different dataset.
In the "IUSCT" folder, you will find around a thousand .xml files, each representing a decision or a separate opinion from the Iran-US Claims Tribunal.
I used this data to write this paper, "A Data Analysis of the Iran-US Claims Tribunal's Jurisprudence: Lessons for International Dispute-Settlement today" (available here). The analysis helped me answer the questions of whether the Tribunal was "political", and whether it mattered.
As a first step, we download the dataset that is located on the Course's Drive (and GitHub). It's a .zip file (downloading entire folders is not very convenient with gdown, which has a limit of 50 files), so we first need to unzip it.
(Not entirely related, but in case you ever wondered, zips rely on compression algorithms, which can be fascinating in their own right. But here is a particular application of compression that is worth looking at.)
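As a two-line illustration of why repetitive data compresses well, using Python's built-in zlib (which implements DEFLATE, the algorithm behind most zips):
import zlib
s = b"code civil " * 1000
print(len(s), "->", len(zlib.compress(s)))  # repetitive input shrinks dramatically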
import gdown
iusct = "https://drive.google.com/uc?id=1mRKp_rYtGlk-UXc_2LpUEuew3zSDhNPB"
iusct_clean = "https://drive.google.com/uc?id=1rzbTTXYoMNmI1WbJPI00-B85c-73qlTR"
# Download the files into the target directory
gdown.download(iusct, "IUSCT", quiet=False)
gdown.download(iusct_clean, "IUSCT_clean.h5", quiet=False)
import zipfile
import os
with zipfile.ZipFile("IUSCT", 'r') as zip_ref:
# Extract all the contents into the directory
zip_ref.extractall(".")
os.chdir("XML")
Once this is done, let's collect the data from the files and put it in a dataframe, which we will then once again use to obtain text that has been cleaned through Spacy.
from lxml import etree
import os
import pandas as pd
def parse_xml(file_path):
# Parse the XML file
tree = etree.parse(file_path)
root = tree.getroot()
# Extract attributes from the body element
body = root.find('.//body')
if body is not None:
metadata = {
'symb': body.get('symb', ''),
'claimant': body.get('claimant', ''),
'respondent': body.get('respondent', ''),
'case': body.get('Case', ''),
'author': body.get('Author', ''),
'doc_number': body.get('DocNumber', ''),
'date': body.get('Date', ''),
'doc_type': body.get('DocType', ''),
'vote': body.get("Vote", '')
}
# Concatenate text from all relevant subelements
text_parts = []
for elem in body.xpath('.//p | .//heading | .//Para | .//span'):
if elem.text:
text_parts.append(elem.text.strip())
metadata['text'] = ' '.join(text_parts)
return metadata
return None
# Directory containing XML files
xml_files = [os.path.join(".", f) for f in os.listdir(".") if f.endswith('.xml')]
data = [metadata for metadata in (parse_xml(file) for file in xml_files) if metadata is not None] # Parse each file once, keeping only successful parses
# Convert list to DataFrame
df = pd.DataFrame(data)
# a bit of cleaning
for index, row in df.iterrows():
    if "issenting" in row["vote"]: # matches "Dissenting" or "dissenting", wherever it appears in the vote field
        df.at[index, "doc_type"] = "DISSENTING"
    elif "oncurr" in row["vote"]: # matches "Concurring", "concurring", etc.
        df.at[index, "doc_type"] = "CONCURRING"
specific_types = ['AWARD ON AGREED TERMS', 'AWARD', 'DISSENTING', 'DECISION', 'CONCURRING']
df['doc_type'] = df['doc_type'].apply(lambda x: x if x in specific_types else 'PARTIAL AWARD')
os.chdir("..")
import spacy
nlp = spacy.load("en_core_web_lg")
# This will take some time, so you can also skip this cell and load the pre-processed .h5 file below instead
def preprocess_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct])
df['processed_text'] = df['text'].apply(preprocess_text)
# (we already moved back to the working directory above, so no further chdir is needed)
# We use HDF5, an open-source file format that supports large, complex, heterogeneous data,
# such as the tuples in the "entities" column
df = pd.read_hdf("IUSCT_clean.h5")
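As an aside, writing such a file is a one-liner. Here is a minimal sketch with a toy dataframe (the data and the key name "df" are invented for illustration; pandas' HDF5 support requires the PyTables package, tables):
import pandas as pd
# Toy stand-in for the parsed decisions (hypothetical data)
toy = pd.DataFrame({
    "doc_type": ["AWARD", "DISSENTING"],
    "entities": [[("Iran", "GPE")], [("Foremost", "ORG")]],
})
# to_hdf stores the dataframe under a named key inside the file
toy.to_hdf("toy.h5", key="df", mode="w")
# read_hdf returns it unchanged, nested tuples included
print(pd.read_hdf("toy.h5", key="df"))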
print(df.iloc[1]["text"])
print(df.iloc[1]["processed_text"])
SEPARATE OPINION OF GEORGE H. ALDRICH, RICHARD C. ALLISON, AND CHARLES T. DUNCAN CONCURRING IN PART AND DISSENTING IN PART We concur in the Award in these two Cases, except for certain findings in Claim D of Case No. A15(IV) and in Case No. A24. In Claim A of Case No. A15(IV), the Tribunal correctly holds that “[o] bligations under General Principle B are, generally speaking, obligations of ‘result,' rather than of ‘conduct' or ‘means'.” Award, para. 95. For this reason, the Tribunal concludes that it need not answer the abstract question of whether the United States breached General Principle B by authorizing the suspension of litigation rather than its termination, but rather holds that the United States breached General Principle B only if, in practice, suspension of litigation proved less effective than termination. Award, paras. 95 and 98. That is, in Claim A the Tribunal concludes that the United States breached General Principle B only if “Iran was reasonably compelled in the prudent defense of its interests to make appearances or file documents in United States courts subsequent to 19 July 1981 in any litigation in respect of claims described in Article II, paragraph 1, of the Claims Settlement Declaration or in respect of claims filed with the Tribunal until such time as those claims are dismissed by the Tribunal for lack of jurisdiction.” Award, para. 101. We concur in that conclusion. Having followed that logical and pragmatic approach in Claim A, the Tribunal proceeds to ignore it in Claim D of Case No. A15(IV) and in Case No. A24. The resolution of Claim D follows directly from the resolution of Claim A, and Case No. A24 is a specific application of the standards set forth in Claim A, yet the Tribunal's resolution of both of these claims is entirely at odds with Claim A. While in those two cases, the Tribunal properly finds the same fact-specific, limited liability of the United States—that is, liability only to the extent, if any, that Iran was reasonably compelled in the prudent defense of its interests to make appearances or file documents in United States courts in any litigation in respect of claims described in paragraph 101 of the Award—the Tribunal nevertheless, in Claim D and Case No. A24, unnecessarily and erroneously divines non-existent obligations and holds the United States in breach thereof. While no additional liability results from these gratuitous findings of breaches, we believe the Tribunal should have remained consistent with its analysis in Claim A and should have held that the United States was in breach of its obligations in Claim D and in Case No. A24 only if Iran was reasonably compelled in the prudent defense of its interests to make appearances or file documents in any litigation in respect of claims described in paragraph 101 of the Award. We therefore dissent from the Tribunal's conclusion in Claim D that the United States failed to act consistently with General Principle B by authorizing lawsuits in its courts for the limited purpose of tolling the statutes of limitations. Award, para. 132. We also dissent from three of the Tribunal's holdings in Case No. 
A24: that the claim Foremost filed in the Tribunal was identical with the claim it filed in District Court; that because the claims were identical, the United States breached the Algiers Declarations by failing to have Foremost's case in the District Court dismissed within a reasonable time after the Tribunal issued its award; and, that even if the Foremost's claim in the Tribunal had not been identical with its claim in the District Court, the United States would still have violated the Algiers Declarations by failing to put the District Court on notice of that portion of Foremost's claim that had been decided by the Tribunal. Award, paras. 198, 203 and 204. Case No. A15(IV) Claim D By Executive Order 12294 of 24 February 1981 (the “Executive Order”), the President of the United States suspended the prosecution in United States courts of all claims that might be presented to the Iran- United States Claims Tribunal pursuant to Article II of the Claims Settlement Declaration. The Executive Order provided that during the period of suspension, the suspended claims would “have no legal effect.” The suspension of any such claim would terminate, however, if the Tribunal were to find that it lacked jurisdiction over the claim. In order to prevent the passage of time from barring a claim against Iran or the United States over which the Tribunal would ultimately determine that it did not have jurisdiction, the Executive Order authorized the filing of an action designed to toll the running of the statute of limitations. While recognizing that General Principle B of the General Declaration did not preclude steps intended to preserve claims that fell outside the Tribunal's jurisdiction, the Tribunal nevertheless holds in the context of Iran's Claim D in Case No. A15(IV) that the method selected by the United States to preserve such claims—the tolling suit—violated its General Principle B commitment to terminate the litigation of claims against Iran and to prohibit all further litigation based on such claims. Award, paras. 131-132. Simply by posing the question of whether the United States authorization of tolling suits violated General Principle B, the Tribunal ignores the careful and pragmatic approach it adopted in Claim A. The Tribunal held in Claim A that the United States could be considered to have violated General Principle B only if Iran was required to file documents or make appearances with respect to the claims described in paragraph 101, and the proper resolution of Claim D follows directly from that holding. Pursuant to Claim A, just as it is irrelevant whether, in the abstract, “suspension” can be said to equal General Principle B's requirement of “termination,” it is likewise irrelevant whether, in the abstract, the tolling suit mechanism can be said to be inconsistent with the requirements of General Principle B. Whether a suit had been filed in the United States courts before the signing of the Algiers Declarations or only after by means of a tolling suit, the scope of United States liability is the same: The United States breached General Principle B and must pay damages to the extent, if any, that Iran was compelled in the prudent defense of its interests to file documents or make appearances in United States courts with respect to the claims discussed in paragraph 101. 
And that liability is unrelated to any conclusion the Tribunal might reach as to the appropriateness of the tolling suit mechanism; that is, the Tribunal's conclusion that the tolling suit mechanism was inconsistent with General Principle B does not subject the United States to any liability if Iran was not compelled to file documents or make appearances in United States courts with respect to such suits. The Tribunal recognizes this in its description of United States liability in paragraph 133. Conversely, the United States could not escape liability had the Tribunal determined that the tolling suit mechanism was consistent with General Principle B if Iran was in fact compelled to file documents or make appearances in United States courts with respect to such suits. The United States obligation is one of result. Regrettably, the Tribunal's conclusion that the tolling suit mechanism violates General Principle B is not only unnecessary and without effect on the liability of the United States; it is also wrong. It is too evident for discussion that a state party to an international agreement is bound to implement its terms in good faith. See Islamic Republic of Iran and United States of America, DEC 62-A21-FT, para. 14 (4 May 1987) reprinted in 14 Iran- U.S. C.T.R. 324, 330. However, as the Award correctly states at paragraph 96, the specific manner of compliance is—unless it is stipulated in the agreement itself—left up to the complying state. Id. at 331. That is not to say that the complying state has absolute discretion as to whether its chosen mechanism for compliance is consonant with the terms of the treaty. The mechanism must by any objective standard be both reasonable and effective. The tolling suit mechanism was both. Paralyzed and of “no legal effect” from the moment of its filing, the tolling suit can hardly be deemed “litigation” in any normal sense. Moreover, any of the other conceivable methods by which the United States might have preserved the claims pending the Tribunal's jurisdictional determination were certain to have raised complex and difficult issues that would have been costly and vexatious to Iran. The tolling suit mechanism raised none of those difficulties; most importantly, it gave Iran exactly what it was entitled to expect under General Principle B—freedom from litigation of claims that were to be decided by the Tribunal. Consequently, we dissent from the Tribunal's conclusion in paragraphs 131 and 132 of the Award that the tolling suit device was inconsistent with the obligations of the United States contained in General Principle B. Case No. A24 In Claim A of Case No. A15(IV), the Tribunal set forth with great care and detail the obligations the United States assumed in adhering to General Principle B. The Tribunal held that the United States breached General Principle B if Iran was reasonably compelled in the prudent defense of its interests to make appearances or file documents subsequent to 19 July 1981 in any litigation in respect of claims described in paragraph 101. It went on say that it would, in the second phase of proceedings, apply that standard to the facts of the cases in American courts in which Iran claims a violation. However, one of those cases, Case No. A24, had already been consolidated with Case No. A15(IV) and so provided the Tribunal its first opportunity to apply the standards it established in Claim A to a particular case filed in United States courts. 
In 1959, the Foremost group of companies (“Foremost”) joined a group of Iranian citizens to establish a dairy, Sherkat Sahami Labaniat Pasteurize Pak (“Pak Dairy”), in Iran. Foremost's equity interests in Pak Dairy fluctuated over the years until, by 1979, Foremost owned or controlled 31% of Pak Dairy's shares. During its 20 years of involvement with Pak Dairy, and regardless of its level of ownership interests therein, Foremost played a major role in the company's management. According to Foremost, beginning in late 1978, the Iranian government entities that were shareholders in Pak Dairy launched a concerted effort to drive Foremost out of Pak Dairy. Foremost believed that the Iranian government entities undertook a series of actions over a period of three to four years which had the cumulative effect of depriving Foremost of the use and benefit of its 31% interest in the company. Consequently, Foremost filed law suits against I in Pak Dairy. Claim before the Iran-United States Claims Tribunal. The Tribunal has jurisdiction only over claims outstanding on 19 January 1981, so for Foremost to prevail in the Tribunal, it had to prove that Iran's creeping expropriation had culminated by that date. See Claims Settlement Declaration, Art. II, para. 1. Fully aware of this jurisdictional requirement, Foremost alleged in its Statements of Claim that “[t]he claims asserted herein were outstanding on January 19, 1981....” Next, Foremost filed a complaint against Iran in the United States District Court for the District of Columbia for the purpose of tolling the applicable statute of limitations. The United States District Court's jurisdiction is not limited to claims arising by 19 January 1981, so Foremost's complaint before the District Court contains no specific allegations as to the date at which Iran's actions ripened into an expropriation. Foremost's claim before the District Court was immediately suspended by Executive Order 12294, and the Tribunal proceeded to decide the claims before it. On 11 April 1986, the Tribunal decided, among other things, that “on balance [Iran's] interference with the substance of Foremost's rights did not, by 19 January 1981, ... amount to an expropriation.” See Foremost Tehran, et al. and Islamic Republic of Iran, et al., Award No. 220-37/231-1 (11 April 1986), reprinted in 10 Iran-U.S. C.T.R. 228, 250. Two years later, on 1 April 1988, Foremost revived its suit in the District Court by filing a Motion for Partial Summary Judgment. As the Award notes in para. 207, Foremost made clear in that Motion that it accepted the Tribunal's determination that the expropriation did not culminate prior to 19 January 1981 and was pursuing in the District Court only a claim for an expropriation culminating after that date. The Tribunal, in Case No. A24, states at the outset that the Statement of Claim Foremost filed in Case No. 37 before the Tribunal and the complaint it filed before the District Court “were identical.” Award, para. 198. The Tribunal then reasons that because the two claims were identical, the United States should have had Foremost's suit in the District Court dismissed within a reasonable time after the Tribunal issued its award, and by failing to do so, the United States “violated its obligation under the Algiers Declarations to terminate finally litigation in United States courts related to claims resolved by the Tribunal on the merits.” Award, para. 203. 
Second, the Tribunal states in dictum that even if the original complaint filed in the District Court had been broader than the Statement of Claim, the United States still would have been obliged under the Algiers Declarations to put the District Court on notice, within a reasonable time after 11 April 1986, that proceedings in the Foremost/OPIC lawsuit were to be considered terminated to the extent they related to that portion of the claim that had been decided by the Tribunal in Foremost (i.e., the claim for a pre-19 January 1981 expropriation of Foremost's ownership interest in Pak Dairy). Award, para. 204. The Tribunal errs in both its reasoning and its conclusions. The Tribunal's initial conclusion—that Foremost's Statement of Claim and its District Court complaint were identical—is patently wrong. The Tribunal acknowledges that the Statement of Claim contained the allegation that Foremost's claim for expropriation was outstanding on 19 January 1981 while the complaint contained no such allegation, but the Tribunal concludes that “this apparent difference does not mean that the Complaint before the District Court was broader than the Statements of Claim before the Tribunal in that it arguably also included an expropriation claim that arose after 19 January 1981.” Award, para. 199. Far from being an “apparent” difference, the fact that the Statements of Claim alleged an expropriation which culminated by 19 January 1981 while the complaint contained no such date restriction is a real and important textual difference which reflects the real and decisive difference between the two claims. Foremost claimed in the Tribunal that Iran had effected a creeping expropriation by 19 January 1981. Foremost's claim in the District Court was not so limited; in the District Court Foremost alleged a creeping expropriation which ri 1981. Indeed, because the District Court's jurisdiction is not confined to claims arising by 19 January 1981, Foremost had no reason to restrict its claim to that date and every reason not to. The purpose of the tolling suit in the District Court was to preserve Foremost's rights in the event the Tribunal subsequently determined that it lacked jurisdiction over that claim, and the only plausible ground for the Tribunal to find lack of jurisdiction was the one it in fact found: that actions for which Iran was responsible had not yet ripened by 19 January 1981 into an expropriation. Thus, there would have been no point to Foremost's even filing the tolling suit had it been limited to the claim that the expropriation occurred by 19 January 1981. Failing to recognize this, the Tribunal erroneously finds that Foremost filed a claim in the District Court that was identical to the claim it filed in the Tribunal; it then reasons from that finding that the United States should have had Foremost's suit in the District Court dismissed within a reasonable time after the Tribunal issued its award in Foremost and that by failing to do so, the United States “violated its obligation under the Algiers Declarations to terminate finally litigation in United States courts related to claims resolved by the Tribunal on the merits.” Award, para. 203. In so holding, the Tribunal creates an obligation that is both wrong on the merits and that flouts the standards the Tribunal so carefully set forth in Claim A. The Tribunal assumes in Case No. 
A24 that, in order to satisfy its obligation to terminate finally cases decided by the Tribunal on the merits, the United States is obliged to remove cases from the dockets of the relevant United States courts once the Tribunal has issued its awards in those cases on the merits. The holding is wrong because the Algiers Declarations impose on the United States no such obligation. In our opinion, the United States satisfied its obligation to terminate finally claims decided by the Tribunal on the merits by promulgating Section 4 of Executive Order 12294, which provides that “[a] determination by the Iran-United States Claims Tribunal on the merits that a claimant is not entitled to recover on a claim shall operate as a final resolution and discharge of the claim for all purposes.” In Claim A the Tribunal set forth the standards applicable to cases such as Case No. A24, and it notably did not impose on the United States an obligation to remove cases from the dockets of United States courts. The reason the Tribunal imposes no such obligation on the United States in Claim A is that the approach the Tribunal does adopt in Claim A precludes such an imposition. The Tribunal states in Claim A that the United States can be considered to have breached General Principle B only if Iran was reasonably compelled in the prudent defense of its interests to make appearances or file documents in the cases described in paragraph 101. Thus, a dead case which remains in name only on the docket of a United States court after the Tribunal has issued an award on the merits of the case is not a violation of General Principle B. Only if Iran was required to file documents or make appearances in such a case would the United States violate General Principle B. One sees, then, that the determination of whether the claim Foremost filed in the Tribunal was identical to the claim it filed in the District Court is important only insofar as it defines the scope of potential United States liability. If the claims had been identical, then the United States would be obliged to pay Iran damages for each instance in which Iran was reasonably compelled in the prudent defense of its interests to file documents or make appearances in the District Court between 11 April 1986 and 1 April 1988. However, since, as we believe, the claim in the District Court was, from its inception, broader than the claim before the Tribunal, the United States liability should extend only to instances in which Iran was reasonably compelled in the prudent defense of its interests to file documents or make appearances after 11 April 1986 in that portion of the case that had been decided by the Tribunal on the merits. The Tribunal's alternative dictum for imposing liability on the United States in Case No. A24 is equally inappropriate. The Tribunal holds in the alternative that even if it believed that the complaint in the District Court had been broader than the Statement of Claim in the Tribunal, the United States would still have been obliged “to put the District Court on notice” within a reasonable time after 11 April 1986 that proceedings in the Foremost lawsuit were to be considered terminated to the extent that they related to that portion of the claim that the Tribunal had decided.” Award, para. 204. Again, we disagree—here even more vehemently—with the Tribunal's determination that such an obligation exists. 
In announcing the obligation, the Tribunal refers to no provision of the Algiers Declarations, and that is because there is no such provision. The Algiers Declarations contain no language whatever that can be interpreted as requiring the Executive Branch of the United States government to remind the Judicial Branch—a co-equal branch of the same government—of the obvious: that it must comply with the United States obligations under the Algiers Declarations. The Tribunal likewise errs in not recognizing that even if General Principle B did obligate the Executive Branch to “put the District Court on notice” that the Foremost proceedings were to be considered terminated to the extent that they related to that portion of the claim the Tribunal had decided, the Executive Branch would have satisfied that obligation by its promulgation of Section 4 of Executive Order 12294. As noted above, section 4 provides as a matter of United States law that “[a] determination by the Iran-United States Claims Tribunal on the merits that a claimant is not entitled to recover on a claim shall operate as a final resolution and discharge of the claim for all purposes.” No clearer “notice” can be imagined, nor was any necessary. But, ultimately, the Tribunal errs, yet again, in reaching a conclusion precluded by the Tribunal's resolution of Claim A. If, as the Tribunal's alternative theory correctly presumes, the claim Foremost filed in the District Court was not identical to the claim it filed in the Tribunal, then pursuant to Claim A, the United States breached the Algiers Declarations only if Iran was compelled in the prudent defense of its interests to file documents or make appearances in the District Court in that portion of the case that had been decided by the Tribunal on the merits. The United States liability is in no way affected by whether or not the Executive Branch “put the District Court on notice.” Regardless of any measures the Executive Branch might have taken, the United States remains liable to Iran if and only if Iran had to file documents or make appearances in the District Court in that portion of the case decided by the Tribunal on the merits. In the Tribunal's words in Claim A, “the test is in factual evidence.” Award, para. 95. Case No. A24 should be a very easy case. The only necessary task before the Tribunal was to determine whether the claim Foremost filed in the District Court was identical to the claim it filed in the Tribunal. Once that determination was made, the Tribunal's resolution of Claim A supplied the relevant standards of liability. The Tribunal has at least applied the latter correctly: in paragraph 205, the Tribunal concludes that the United States is liable to Iran for “damages to the extent [Iran] was reasonably compelled in the prudent defense of its interests to make appearances or file documents with respect to the Foremost/OPIC lawsuit from 11 April 1986 until 1 April 1988, to the extent those expenses are not already sought by Iran in Case No. A15(IV).” Thus, the Tribunal's final conclusion as to liability is consistent with Claim A as well as with the Tribunal's initial but erroneous determination that Foremost's two claims were identical. In between the Tribunal's initial determination and its final conclusion, however, the Tribunal implies into General Principle B two obligations that are both wrong on the merits and in direct contravention of the standards the Tribunal established in Claim A. 
For these many reasons, we dissent from Part VI(B)(3)(a) of the Award; we believe that Iran's entire claim in Case No. A24 should be dismissed. We, therefore, concur in Part VI(B)(3)(b) of the Award in which the Tribunal dismisses Iran's claim in Case No. A24 to the extent it relates to the period beginning 1 April 1988. Dated, The Hague, 28 December 1998 George H. Aldrich Richard C. Allison Charles T. Duncan Iran-U.S.Cl.Trib. 1998 Islamic Republic of Iran v. United States of America Footnotes Iran Award 590-A15(IV)/A24-FT (Iran-U.S.Cl.Trib.), 1998 WL 930569 End of Document © 2018 Thomson Reuters. No claim to original U.S. Government Works. © 2018 Thomson Reuters. No claim to original U.S. Government Works. 6 separate opinion george h. aldrich richard c. allison charles t. duncan concur dissenting concur award case certain finding claim d case a15(iv case a24 claim case a15(iv tribunal correctly hold o bligation general principle b generally speak obligation result conduct mean award para 95 reason tribunal conclude need answer abstract question united states breach general principle b authorize suspension litigation termination hold united states breach general principle b practice suspension litigation prove effective termination award para 95 98 claim tribunal conclude united states breach general principle b iran reasonably compel prudent defense interest appearance file document united states court subsequent 19 july 1981 litigation respect claim describe article ii paragraph 1 claims settlement declaration respect claim file tribunal time claim dismiss tribunal lack jurisdiction award para 101 concur conclusion having follow logical pragmatic approach claim tribunal proceed ignore claim d case a15(iv case a24 resolution claim d follow directly resolution claim case a24 specific application standard set forth claim tribunal resolution claim entirely odd claim a. 
case tribunal properly find fact specific limited liability united states liability extent iran reasonably compel prudent defense interest appearance file document united states court litigation respect claim describe paragraph 101 award tribunal claim d case a24 unnecessarily erroneously divine non existent obligation hold united states breach thereof additional liability result gratuitous finding breach believe tribunal remain consistent analysis claim hold united states breach obligation claim d case a24 iran reasonably compel prudent defense interest appearance file document litigation respect claim describe paragraph 101 award dissent tribunal conclusion claim d united states fail act consistently general principle b authorize lawsuit court limited purpose toll statute limitation award para 132 dissent tribunal holding case a24 claim foremost file tribunal identical claim file district court claim identical united states breach algiers declarations fail foremost case district court dismiss reasonable time tribunal issue award foremost claim tribunal identical claim district court united states violate algiers declarations fail district court notice portion foremost claim decide tribunal award paras 198 203 204 case a15(iv claim d executive order 12294 24 february 1981 executive order president united states suspend prosecution united states court claim present iran- united states claims tribunal pursuant article ii claims settlement declaration executive order provide period suspension suspend claim legal effect suspension claim terminate tribunal find lack jurisdiction claim order prevent passage time bar claim iran united states tribunal ultimately determine jurisdiction executive order authorize filing action design toll running statute limitation recognize general principle b general declaration preclude step intend preserve claim fall outside tribunal jurisdiction tribunal hold context iran claim d case a15(iv method select united states preserve claim tolling suit violate general principle b commitment terminate litigation claim iran prohibit litigation base claim award paras 131 132 simply pose question united states authorization tolling suit violate general principle b tribunal ignore careful pragmatic approach adopt claim a. tribunal hold claim united states consider violate general principle b iran require file document appearance respect claim describe paragraph 101 proper resolution claim d follow directly holding pursuant claim irrelevant abstract suspension say equal general principle b requirement termination likewise irrelevant abstract tolling suit mechanism say inconsistent requirement general principle b. 
suit file united states court signing algiers declarations mean tolling suit scope united states liability united states breach general principle b pay damage extent iran compel prudent defense interest file document appearance united states court respect claim discuss paragraph 101 liability unrelated conclusion tribunal reach appropriateness tolling suit mechanism tribunal conclusion tolling suit mechanism inconsistent general principle b subject united states liability iran compel file document appearance united states court respect suit tribunal recognize description united states liability paragraph 133 conversely united states escape liability tribunal determine tolling suit mechanism consistent general principle b iran fact compel file document appearance united states court respect suit united states obligation result regrettably tribunal conclusion tolling suit mechanism violate general principle b unnecessary effect liability united states wrong evident discussion state party international agreement bind implement term good faith islamic republic iran united states america dec 62 a21 ft para 14 4 1987 reprint 14 iran- u.s. c.t.r. 324 330 award correctly state paragraph 96 specific manner compliance stipulate agreement leave comply state id. 331 comply state absolute discretion choose mechanism compliance consonant term treaty mechanism objective standard reasonable effective tolling suit mechanism paralyzed legal effect moment filing tolling suit hardly deem litigation normal sense conceivable method united states preserve claim pende tribunal jurisdictional determination certain raise complex difficult issue costly vexatious iran tolling suit mechanism raise difficulty importantly give iran exactly entitle expect general principle b freedom litigation claim decide tribunal consequently dissent tribunal conclusion paragraph 131 132 award tolling suit device inconsistent obligation united states contain general principle b. case a24 claim case a15(iv tribunal set forth great care detail obligation united states assume adhere general principle b. 
tribunal hold united states breach general principle b iran reasonably compel prudent defense interest appearance file document subsequent 19 july 1981 litigation respect claim describe paragraph 101 go second phase proceeding apply standard fact case american court iran claim violation case case a24 consolidate case a15(iv provide tribunal opportunity apply standard establish claim particular case file united states court 1959 foremost group company foremost join group iranian citizen establish dairy sherkat sahami labaniat pasteurize pak pak dairy iran foremost equity interest pak dairy fluctuate year 1979 foremost own control 31 pak dairy share 20 year involvement pak dairy regardless level ownership interest foremost play major role company management accord foremost begin late 1978 iranian government entity shareholder pak dairy launch concerted effort drive foremost pak dairy foremost believe iranian government entity undertake series action period year cumulative effect deprive foremost use benefit 31 interest company consequently foremost file law suit pak dairy claim iran united states claims tribunal tribunal jurisdiction claim outstanding 19 january 1981 foremost prevail tribunal prove iran creep expropriation culminate date claims settlement declaration art ii para 1 fully aware jurisdictional requirement foremost allege statement claim t]he claim assert outstanding january 19 1981 foremost file complaint iran united states district court district columbia purpose toll applicable statute limitation united states district court jurisdiction limit claim arise 19 january 1981 foremost complaint district court contain specific allegation date iran action ripen expropriation foremost claim district court immediately suspend executive order 12294 tribunal proceed decide claim 11 april 1986 tribunal decide thing balance iran interference substance foremost right 19 january 1981 expropriation foremost tehran et al islamic republic iran et al award 220 37/231 1 11 april 1986 reprint 10 iran u.s. c.t.r. 228 250 year later 1 april 1988 foremost revive suit district court file motion partial summary judgment award note para 207 foremost clear motion accept tribunal determination expropriation culminate prior 19 january 1981 pursue district court claim expropriation culminate date tribunal case a24 state outset statement claim foremost file case 37 tribunal complaint file district court identical award para 198 tribunal reason claim identical united states foremost suit district court dismiss reasonable time tribunal issue award fail united states violate obligation algiers declarations terminate finally litigation united states court relate claim resolve tribunal merit award para 203 second tribunal state dictum original complaint file district court broad statement claim united states oblige algiers declarations district court notice reasonable time 11 april 1986 proceeding foremost opic lawsuit consider terminate extent relate portion claim decide tribunal foremost i.e. 
claim pre-19 january 1981 expropriation foremost ownership interest pak dairy award para 204 tribunal err reasoning conclusion tribunal initial conclusion foremost statement claim district court complaint identical patently wrong tribunal acknowledge statement claim contain allegation foremost claim expropriation outstanding 19 january 1981 complaint contain allegation tribunal conclude apparent difference mean complaint district court broad statement claim tribunal arguably include expropriation claim arise 19 january 1981 award para 199 far apparent difference fact statement claim allege expropriation culminate 19 january 1981 complaint contain date restriction real important textual difference reflect real decisive difference claim foremost claim tribunal iran effect creep expropriation 19 january 1981 foremost claim district court limited district court foremost allege creep expropriation ri 1981 district court jurisdiction confine claim arise 19 january 1981 foremost reason restrict claim date reason purpose tolling suit district court preserve foremost right event tribunal subsequently determine lack jurisdiction claim plausible ground tribunal find lack jurisdiction fact find action iran responsible ripen 19 january 1981 expropriation point foremost file tolling suit limit claim expropriation occur 19 january 1981 fail recognize tribunal erroneously find foremost file claim district court identical claim file tribunal reason finding united states foremost suit district court dismiss reasonable time tribunal issue award foremost fail united states violate obligation algiers declarations terminate finally litigation united states court relate claim resolve tribunal merit award para 203 hold tribunal create obligation wrong merit flout standard tribunal carefully set forth claim a. tribunal assume case a24 order satisfy obligation terminate finally case decide tribunal merit united states oblige remove case docket relevant united states court tribunal issue award case merit holding wrong algiers declaration impose united states obligation opinion united states satisfy obligation terminate finally claim decide tribunal merit promulgate section 4 executive order 12294 provide determination iran united states claim tribunal merit claimant entitle recover claim shall operate final resolution discharge claim purpose claim tribunal set forth standard applicable case case a24 notably impose united states obligation remove case docket united states court reason tribunal impose obligation united states claim approach tribunal adopt claim preclude imposition tribunal state claim united states consider breach general principle b iran reasonably compel prudent defense interest appearance file document case describe paragraph 101 dead case remain docket united states court tribunal issue award merit case violation general principle b. iran require file document appearance case united states violate general principle b. 
see determination claim foremost file tribunal identical claim file district court important insofar define scope potential united states liability claim identical united states oblige pay iran damage instance iran reasonably compel prudent defense interest file document appearance district court 11 april 1986 1 april 1988 believe claim district court inception broad claim tribunal united states liability extend instance iran reasonably compel prudent defense interest file document appearance 11 april 1986 portion case decide tribunal merit tribunal alternative dictum impose liability united states case a24 equally inappropriate tribunal hold alternative believe complaint district court broad statement claim tribunal united states oblige district court notice reasonable time 11 april 1986 proceeding foremost lawsuit consider terminate extent relate portion claim tribunal decide award para 204 disagree vehemently tribunal determination obligation exist announce obligation tribunal refer provision algiers declaration provision algiers declaration contain language interpret require executive branch united states government remind judicial branch co equal branch government obvious comply united states obligation algiers declaration tribunal likewise err recognize general principle b obligate executive branch district court notice foremost proceeding consider terminate extent relate portion claim tribunal decide executive branch satisfy obligation promulgation section 4 executive order 12294 note section 4 provide matter united states law determination iran united states claims tribunal merit claimant entitle recover claim shall operate final resolution discharge claim purpose clear notice imagine necessary ultimately tribunal err reach conclusion preclude tribunal resolution claim a. tribunal alternative theory correctly presume claim foremost file district court identical claim file tribunal pursuant claim united states breach algiers declaration iran compel prudent defense interest file document appearance district court portion case decide tribunal merit united states liability way affect executive branch district court notice regardless measure executive branch take united states remain liable iran iran file document appearance district court portion case decide tribunal merit tribunal word claim test factual evidence award para 95 case a24 easy case necessary task tribunal determine claim foremost file district court identical claim file tribunal determination tribunal resolution claim supply relevant standard liability tribunal apply correctly paragraph 205 tribunal conclude united states liable iran damage extent iran reasonably compel prudent defense interest appearance file document respect foremost opic lawsuit 11 april 1986 1 april 1988 extent expense seek iran case a15(iv tribunal final conclusion liability consistent claim tribunal initial erroneous determination foremost claim identical tribunal initial determination final conclusion tribunal imply general principle b obligation wrong merit direct contravention standard tribunal establish claim a. reason dissent vi(b)(3)(a award believe iran entire claim case a24 dismiss concur vi(b)(3)(b award tribunal dismiss iran claim case a24 extent relate period begin 1 april 1988 date hague 28 december 1998 george h. aldrich richard c. allison charles t. duncan iran u.s.cl trib 1998 islamic republic iran v. 
united states america footnotes iran award 590 a15(iv)/a24 ft iran u.s.cl trib 1998 wl 930569 end document © 2018 thomson reuters claim original u.s. government works © 2018 thomson reuters claim original u.s. government works 6
Now that we have all that nice and clean text, we can use it for various purposes.
1. Named Entity Recognition¶
This is a sub-task of information extraction that identifies and classifies named entities (NEs) in text into predefined categories. In the legal domain, these categories can include names of people, organizations, locations, specific legal terms, references to legislation, and dates. Effective NER can greatly assist in automating and improving the efficiency of legal document analysis, case summarization, and research.
NER systems generally use machine learning (often deep learning) or rule-based algorithms to identify words or phrases within a corpus that correspond to the various entity types. These algorithms rely on both syntactic clues (like capitalization and prepositions) and semantic information (context and word structure) to detect entities.
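Before applying this to the whole corpus, here is a one-sentence sketch of what spaCy's NER returns (the sentence is invented, and the exact labels are whatever the model predicts, so treat the ones in the comment as indicative):
import spacy
nlp = spacy.load("en_core_web_lg")
doc = nlp("On 19 January 1981, Iran and the United States signed the Algiers Declarations.")
# doc.ents holds the detected entity spans, each with a predicted label
for ent in doc.ents:
    print(ent.text, "->", ent.label_)
# Expect something like: "19 January 1981" -> DATE, "Iran" -> GPE, "the United States" -> GPE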
import spacy
nlp = spacy.load("en_core_web_lg")
# Running NER over the full corpus also takes a while; the pre-processed .h5 file
# already contains the resulting "entities" column
def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]
df['entities'] = df['text'].apply(extract_entities)
types = ['DATE', 'GPE', 'ORG', 'PERSON', 'WORK_OF_ART', 'NORP', 'LAW', 'PRODUCT', 'MONEY', 'CARDINAL', 'ORDINAL', 'LOC', 'FAC', 'LANGUAGE', 'PERCENT', 'TIME', 'QUANTITY', 'EVENT']
import pandas as pd
# Explode the entities into separate rows
df_exploded = df.explode('entities')
# Create separate columns for entity text and type
df_exploded[['entity_text', 'entity_type']] = pd.DataFrame(df_exploded['entities'].tolist(), index=df_exploded.index)
# Drop rows without entities
df_exploded = df_exploded.dropna(subset=['entity_text', 'entity_type'])
# Aggregate data to count occurrences of each entity within each document type and entity type
entity_counts = df_exploded.groupby(['doc_type', 'entity_type', 'entity_text']).size().reset_index(name='count')
# Sort and take top N entities for each combination of doc_type and entity_type
top_n = 10 # You can change this value based on how many top items you want
top_entities = entity_counts.groupby(['doc_type', 'entity_type']).apply(lambda x: x.nlargest(top_n, 'count')).reset_index(drop=True)
# Function to get the top N entities, properly handling grouping columns
def get_top_n_entities_by_category(data, n=10):
    # Group by 'entity_type' first and then 'doc_type' within each entity type
    grouped = data.groupby(['entity_type', 'doc_type'])
    # Apply to get the largest n entries while retaining group columns
    top_n = grouped.apply(lambda x: x.nlargest(n, 'count')).reset_index(drop=True)
    return top_n
# Apply the function to the DataFrame
top_entities_by_category = get_top_n_entities_by_category(top_entities)
# Ensure the DataFrame still contains the necessary columns
print(top_entities_by_category.columns)
# Display the results grouped by NER category
for entity_type, group in top_entities_by_category.groupby('entity_type'):
    print(f'Entity Type: {entity_type}')
    for doc_type, sub_group in group.groupby('doc_type'):
        print(f'  Document Type: {doc_type}')
        for _, row in sub_group.iterrows():
            print(f"    {row['entity_text']}: {row['count']}")
        print()  # Add a newline for better separation
We can then use these results to further locate relevant text in the dataset.
def find_entity_in_xml(entity, xml_folder_path):
    results = []
    # Loop through each XML file in the directory
    for xml_file in os.listdir(xml_folder_path):
        if xml_file.endswith('.xml'):
            file_path = os.path.join(xml_folder_path, xml_file)
            tree = etree.parse(file_path)
            root = tree.getroot()
            # Search for all text-containing elements; adjust the tag names to your XML structure
            for element in root.xpath('.//p | .//heading | .//Para | .//span'):
                if element.text and entity in element.text:
                    results.append({
                        'file': xml_file,
                        'entity': entity,
                        'context': element.text
                    })
    return pd.DataFrame(results)
xml_folder_path = 'XML'
entity = "British"
# Call the function
entity_occurrences_df = find_entity_in_xml(entity, xml_folder_path)
# Display the DataFrame
print(entity_occurrences_df)
2. Sentiment Analysis¶
Sentiment Analysis is a branch of natural language processing (NLP) that involves analyzing text to determine the sentiment expressed within it. This technique is widely used to discern the underlying emotional tone of textual data, ranging from reviews and social media comments to news articles and documents. The primary goal of sentiment analysis is to identify and categorize opinions expressed in a piece of text, especially to determine whether the writer's attitude towards a particular topic or product is positive, negative, or neutral.
Core Concepts: Polarity and Subjectivity
Polarity: Polarity in sentiment analysis refers to the orientation of the emotional sentiment expressed in a piece of text. It typically measures how positive or negative a statement is. Polarity is often quantified on a scale from -1 to 1, where:
- -1 indicates a strongly negative sentiment.
- 0 represents a neutral sentiment.
- 1 signifies a strongly positive sentiment.
Polarity helps businesses and analysts gauge public sentiment towards products, services, or topics, enabling informed decision-making based on customer or public opinion.
Subjectivity: Subjectivity, on the other hand, indicates how much personal opinion or subjective feeling is expressed in the text, as opposed to factual information. It is usually measured on a scale from 0 to 1, where:
- 0 indicates that the text is completely objective.
- 1 indicates that the text is highly subjective.
Understanding subjectivity is crucial as it helps differentiate between factual statements and personal opinions, providing a deeper layer of context to the sentiment analysis.
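To make these two scores concrete, here is a quick sketch using TextBlob directly (which is the library that the spacytextblob component we use below wraps; the sentences are invented, and the exact values depend on TextBlob's lexicon):
from textblob import TextBlob
# An opinionated sentence and a factual one (both made up)
opinion = TextBlob("The Tribunal's reasoning in this claim is careless and deeply unconvincing.")
fact = TextBlob("The Tribunal issued its award on 11 April 1986.")
# .sentiment is a (polarity, subjectivity) named tuple
print(opinion.sentiment)  # negative polarity, high subjectivity
print(fact.sentiment)     # near-zero polarity, low subjectivity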
Applications of Sentiment Analysis
- Market Research and Consumer Feedback: Businesses use sentiment analysis to understand consumer reactions to products or services, helping them tailor offerings and improve customer satisfaction.
- Social Media Monitoring: By analyzing posts, comments, and interactions on social media platforms, companies can gauge public sentiment towards brands, events, or news stories.
- Customer Service: Automating sentiment analysis helps prioritize responses to customer interactions based on the sentiment conveyed, enhancing customer service efficiency.
- Political Campaigns and Public Policy: Sentiment analysis can track public opinion on political candidates or policy issues, influencing campaign strategies and policymaking.
# This takes about 15 minutes on the dataset, so you can skip this cell and go to the next one
from spacytextblob.spacytextblob import SpacyTextBlob

# Load your spaCy model
nlp = spacy.load('en_core_web_lg')  # Make sure to use a model that matches your language needs

# Add the TextBlob component to the spaCy pipeline
nlp.add_pipe('spacytextblob')

def analyze_sentiment(documents):
    """Analyzes sentiment of the provided documents and returns polarity and subjectivity scores."""
    sentiments = {'polarity': [], 'subjectivity': []}
    for doc in documents:
        processed_doc = nlp(doc)
        sentiments['polarity'].append(processed_doc._.blob.polarity)
        sentiments['subjectivity'].append(processed_doc._.blob.subjectivity)
    return sentiments

# Assuming 'df' is your DataFrame with a 'text' column and a 'doc_type' column
df['sentiment'] = df['text'].apply(lambda x: analyze_sentiment([x]))

# Extract the first element of each list in the dictionaries for polarity and subjectivity
df['polarity'] = df['sentiment'].apply(lambda x: x['polarity'][0] if x['polarity'] else None)
df['subjectivity'] = df['sentiment'].apply(lambda x: x['subjectivity'][0] if x['subjectivity'] else None)

# Calculate the average polarity and subjectivity by document type
average_sentiments = df.groupby('doc_type')[['polarity', 'subjectivity']].mean()
print(average_sentiments)
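If the runtime of the cell above is a concern, spaCy's nlp.pipe streams texts through the pipeline in batches, which is typically much faster than calling nlp() once per row. A minimal sketch, assuming the same nlp pipeline and df as above:

# Stream all texts through the pipeline in batches instead of one nlp() call per row
polarities, subjectivities = [], []
for doc in nlp.pipe(df['text'], batch_size=50):
    polarities.append(doc._.blob.polarity)
    subjectivities.append(doc._.blob.subjectivity)

df['polarity'] = polarities
df['subjectivity'] = subjectivities
print(df.groupby('doc_type')[['polarity', 'subjectivity']].mean())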
3. Topic Modelling¶
Topic Modeling is a type of statistical modeling for discovering the abstract topics that occur in a collection of documents. It is often used to uncover hidden thematic structures in large textual corpora, helping a reader grasp the key themes of a body of text without examining every document individually.
How Topic Modeling Works
Topic modeling algorithms identify patterns of word clusters and expressions that frequently co-occur in a given text corpus, categorizing these co-occurrences into topics. Essentially, each topic is characterized by a set of words, with each word having a certain probability of appearing in that topic.
The most common approach to topic modeling is Latent Dirichlet Allocation (LDA), which treats each document as a mixture of topics and each topic as a probability distribution over words; the model assumes each document was generated by drawing its words from that mixture of topics.
Applications in the Real World
- Content Recommendation: Topic modeling can help in building content recommendation systems by identifying topics within user-generated data, such as reviews or articles, and recommending similar content to users.
- Document Clustering and Classification: By understanding the topics, documents can be clustered into groups or classified into predefined categories more accurately.
- Information Retrieval: Enhances search engines by allowing them to return documents that are similar in thematic content rather than simply matching keywords.
- Customer Feedback Analysis: Used in analyzing customer feedback by extracting topics from feedback messages to identify common themes in customer concerns or discussions.
Challenges
- Determining the number of topics can be subjective and requires domain knowledge or iterative refinement based on model evaluation metrics such as coherence score or held-out perplexity; a rough selection sketch follows this list.
- Interpreting the topics can sometimes be challenging as the model may produce topics that are not easily distinguishable or that contain overlapping content.
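One rough way to pick the number of topics with scikit-learn is to compare held-out perplexity across candidate values (scikit-learn does not compute coherence itself; libraries such as gensim do). A minimal sketch, assuming the document-term matrix text_data built in the next cell has already been created:

# Compare held-out perplexity (lower is better) for several candidate topic counts
# Assumes `text_data` from the vectorization step below
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

train, test = train_test_split(text_data, test_size=0.2, random_state=0)
for k in (5, 10, 15, 20):
    lda_k = LatentDirichletAllocation(n_components=k, random_state=0).fit(train)
    print(f"{k:>2} topics: perplexity = {lda_k.perplexity(test):.1f}")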
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

# Vectorize texts for LDA
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
text_data = vectorizer.fit_transform(df['processed_text'])

# Fit the LDA model
lda = LatentDirichletAllocation(n_components=10, random_state=0)
lda.fit(text_data)

# Function to display topics
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx + 1),
              " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics(lda, vectorizer.get_feature_names_out(), 10)
# After fitting the LDA model
topic_distributions = lda.transform(text_data)
# Find the topic number that has the highest probability for each document
df['dominant_topic'] = topic_distributions.argmax(axis=1) + 1 # Adding 1 to match the topic indices starting from 1
df.groupby("doc_type")["dominant_topic"].value_counts().unstack(level=0)
# Calculate the percentage of documents discussing each topic within each doc_type
# Group by 'doc_type' and 'dominant_topic' and count the number of documents
topic_counts = df.groupby(['doc_type', 'dominant_topic']).size().reset_index(name='count')
# Calculate the total number of documents in each doc_type
total_counts = df.groupby('doc_type').size().reset_index(name='total')
# Merge the counts to calculate percentages
topic_percentage = pd.merge(topic_counts, total_counts, on='doc_type')
topic_percentage['percentage'] = (topic_percentage['count'] / topic_percentage['total']) * 100
# Display the result
print(topic_percentage[['doc_type', 'dominant_topic', 'percentage']])
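As a small follow-up, the same percentages read more easily as a topics-by-document-type table; a sketch using the topic_percentage frame from above:

# Pivot so each row is a topic and each column a document type
pivot = topic_percentage.pivot(index='dominant_topic', columns='doc_type', values='percentage')
print(pivot.round(1).fillna(0))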