Applying Stylometry to the Unabomber Case¶
Background¶
- In 1996 the mathematician Ted Kaczynski, better known as the Unabomber, was arrested on domestic terrorism charges.
- Part of what led to his arrest was the FBI's linguistic analysis of his writing.
- Although software was also used at the time, tooling has come a long way since then.
- In this report we will use the evolving field of stylometry, combined with machine learning and data visualization, to perform our own analysis algorithmically.
- You can read the FBI's analysis at the link below.
- https://www.thetedkarchive.com/library/james-r-fitzgerald-the-ted-unabom-and-cabin-documents
Ethical Concerns¶
- Ted Kaczynski killed people in order to bring attention to his critiques of society.
- Ted Kaczynski also lived in a representative democracy that, while imperfect, allowed for legitimate outlets to express those critiques.
- Hiding in a cabin and mailing bombs to university professors is not one of those legitimate outlets. It is an extraordinary act of cowardice.
- We will not focus on his beliefs in this report but rather the applied use of stylometry.
Imports¶
from bs4 import BeautifulSoup
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from matplotlib.colors import ListedColormap
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import plotly.express as px
import re
import plotly.offline as py
py.init_notebook_mode(connected=True)
Parsing¶
- The FBI separated its documents into T-Docs and U-Docs.
- The T-Docs are letters his brother turned over to investigators, including their correspondence back and forth.
- The U-Docs are letters Kaczynski mailed anonymously to news organizations.
- In addition to the letters, Kaczynski also wrote a 35,000-word manifesto.
- As of right now, the best resource for obtaining these documents is the website https://thetedkarchive.com.
- Below is the code used to extract these documents from the HTML that contains them.
- We also include letters from his brother Dave, his manifesto, and letters from Charles Darwin as a control.
Unabomber Letters¶
# https://www.thetedkarchive.com/library/theo-slade-the-bombings-communications-of-ted-kaczynski-as-part-of-his-terror-campaign
# extract udocs from html
rows = []
with open("./u-docs.html") as f:
    soup = BeautifulSoup(f, "html.parser")
for p in soup.find_all("p"):
    if not p.get_text(strip=True).startswith("Ted:"):
        continue
    blk = p.find_next("blockquote")
    if not blk:
        continue
    blkText = blk.get_text(strip=True, separator=" ")
    rows.append(("UB Letters", blkText))
Unabomber Manifesto¶
# https://archive.nytimes.com/www.nytimes.com/library/national/unabom-manifesto-1.html
# extract uman from html
with open("./u-man.html") as f:
    soup = BeautifulSoup(f, "html.parser")
for p in soup.find_all("p"):
    text = p.get_text(strip=True, separator=" ")
    if re.match(r"\d+\.", text):
        rows.append(("UB Man", text))
Ted and Dave Letters¶
# extract tdocs from html
with open("./t-docs.html") as f:
    soup = BeautifulSoup(f, "html.parser")
for h4 in soup.find_all("h4"):
    h4Text = h4.get_text()
    if h4Text.startswith("From Ted to") and "(T-" in h4Text:
        contents = []
        for sib in h4.next_siblings:
            if sib.name == "h4" or (sib.name == "h3" and sib.get_text() == "Sources"):
                break
            contents.append(sib.get_text(strip=True, separator=" "))
        rows.append(("Ted Letters", " ".join(contents)))
    if h4Text.startswith("From Dave"):
        contents = []
        for sib in h4.next_siblings:
            if sib.name == "h4" or (sib.name == "h3" and sib.get_text() == "Sources"):
                break
            contents.append(sib.get_text(strip=True, separator=" "))
        rows.append(("Dave Letters", " ".join(contents)))
Charles Darwin Letters¶
# darwin letters for a control
# https://www.gutenberg.org/
with open("./darwin.html") as f:
    soup = BeautifulSoup(f, "html.parser")
p_tags = soup.find_all("p")
current_letter = None
contents = []
for p in p_tags:
    text = p.get_text(strip=True, separator=" ")
    if text.startswith("LETTER"):
        # save previous letter first
        if current_letter is not None and contents:
            rows.append(("Darwin Control", " ".join(contents)))
        # start new collection
        current_letter = text
        contents = []
        continue
    # inside letter
    if current_letter is not None:
        contents.append(text)
# save last letter
if current_letter is not None and contents:
    rows.append(("Darwin Control", " ".join(contents)))
df = pd.DataFrame(rows, columns=["Doc Type", "rawTxt"])
Plot 1¶
In this first plot we run a TF-IDF algorithm over the words while removing stop words (filler words such as "the" and "is"). The algorithm counts the frequency of each word in a document and scales it by its inverse document frequency, which penalizes words that appear commonly across the entire corpus. This leaves us with a 5,000-dimensional feature vector per document. We can't visualize 5,000 dimensions, so we squeeze them down to three using principal component analysis (PCA), which projects correlated dimensions onto a small number of directions of greatest variance.
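As a quick aside, TF-IDF's downweighting of corpus-wide words can be seen on a toy corpus (the sentences below are made up purely for illustration):

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# three toy documents, invented for illustration
docs = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "the bird flew over the hills",
]
vec = TfidfVectorizer(stop_words="english")
X = vec.fit_transform(docs)
vocab = vec.vocabulary_
row0 = X.toarray()[0]

# "sat" appears in two documents, so its IDF (and hence its weight in the
# first document) is lower than that of "cat", which appears in only one;
# stop words such as "the" never make it into the vocabulary at all
```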
def clean(text):
    text = text.lower()
    tokens = text.split()
    return " ".join(tokens)

df["cleanTxt"] = df["rawTxt"].apply(clean)
# TF-IDF TO 3D PCA
# good for content similarity
# https://www.geeksforgeeks.org/machine-learning/understanding-tf-idf-term-frequency-inverse-document-frequency/
# https://www.ibm.com/think/topics/principal-component-analysis
X = TfidfVectorizer(max_features=5000, stop_words="english").fit_transform(
    df["cleanTxt"]
)
pca = PCA(n_components=3)
pcs = pca.fit_transform(X.toarray())
df["PC1"] = pcs[:, 0]
df["PC2"] = pcs[:, 1]
df["PC3"] = pcs[:, 2]
fig = px.scatter_3d(
    df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="Doc Type",
    height=700,
    title="TF-IDF 3D PCA Bag of Words No Stop Words",
)
fig.show()
Dr. James O'Sullivan is a researcher in the field of computer-assisted text analysis. In the video below he states that when you remove stop words, what you are left with is the content of the text rather than the writing style. We can see that in our visualization, with the Darwin letters clearly separated from the rest of the corpus. We also notice some overlap between the Ted letters and his brother's letters. This is expected, as they are communicating back and forth on the same subjects.
Plot 2¶
In this plot we take the average sentence length of each document and plot it using a box and whisker plot.
# avg sent len box whisker plot
def avgSent(text):
    # split on sentence-ending punctuation, dropping empty fragments so
    # trailing splits don't drag the average down
    sents = [s for s in re.split(r"[.!?]", text) if s.strip()]
    lengths = [len(s.split()) for s in sents]
    return sum(lengths) / len(lengths) if lengths else 0

df["Avg Sent Length"] = df["cleanTxt"].apply(avgSent)
# plotly box whisker on avg sent for each doc type
fig = px.box(
    df,
    x="Doc Type",
    y="Avg Sent Length",
    title="Average Sentence Length by Document Type",
    color="Doc Type",
    height=700,
    points="all",
)
fig.show()
Here we notice a departure in styles between the two brothers, with Ted having a longer average sentence length.
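That gap could also be quantified numerically with a groupby median; here is a minimal sketch of the pattern on invented numbers (not taken from our corpus):

```python
import pandas as pd

# hypothetical per-document averages, made up to show the groupby pattern
toy = pd.DataFrame({
    "Doc Type": ["Ted Letters", "Ted Letters", "Dave Letters", "Dave Letters"],
    "Avg Sent Length": [28.0, 24.0, 15.0, 17.0],
})

# median average sentence length per document type
medians = toy.groupby("Doc Type")["Avg Sent Length"].median()
```

The same one-liner applied to our real `df` would give the per-group medians behind the box plot.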
Plot 3¶
In his study, Grieve (2007) finds that character n-grams (up to about 6-grams) can be useful as authorship markers along with various measures of word and punctuation distribution, and shows how with a decreasing number of candidate authors in a closed set, other features, including some measures of lexical richness and average word and sentence length, might have some role to play, but generally lack a strong predictive power. - Tim Grant
In this plot we use the same algorithm as in the first plot, but this time we partition the text into character n-grams of two to six characters instead of words. We also leave in the stop words this time. This is said to measure writing style more than content. It's worth noting that in the original linguistic analysis, common misspellings in the Ted/Unabomber documents were among the determining factors.
# https://www.cambridge.org/core/elements/idea-of-progress-in-forensic-authorship-analysis/6A4F7668B4831CCD7DBF74DECA3EBA06
X = TfidfVectorizer(
    analyzer="char",
    ngram_range=(2, 6),
    min_df=2,
    max_features=10000,
).fit_transform(df["rawTxt"])
pca = PCA(n_components=3)
pcs = pca.fit_transform(X.toarray())
df["PC1"] = pcs[:, 0]
df["PC2"] = pcs[:, 1]
df["PC3"] = pcs[:, 2]
fig = px.scatter_3d(
    df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="Doc Type",
    height=700,
    title="TF-IDF 3D PCA 6-grams Grieve (2007)",
)
fig.show()
Here we see clustering in the manifesto and the Darwin control documents. Toggling these two datasets off, we also see tighter clustering between some Unabomber letters and Ted letters. However, we also see that some Unabomber letters are close to the brother's letters.
Plot 4¶
In this plot we combine the method from our first plot with our sentence-length feature. This time we are leaving in stop words, since we are interested in writing style. Our goal here is to use the difference in Ted's sentence length to try to separate his writing style from his brother's.
X = TfidfVectorizer(max_features=5000).fit_transform(df["cleanTxt"])
# PCA's fit_transform ignores its second argument, so the sentence-length
# feature must be concatenated onto the matrix explicitly (z-scored so it
# lives on a comparable scale to the TF-IDF columns)
sent = df[["Avg Sent Length"]].to_numpy()
sent = (sent - sent.mean()) / sent.std()
features = np.hstack([X.toarray(), sent])
pca = PCA(n_components=3)
pcs = pca.fit_transform(features)
df["PC1"] = pcs[:, 0]
df["PC2"] = pcs[:, 1]
df["PC3"] = pcs[:, 2]
fig = px.scatter_3d(
    df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="Doc Type",
    height=700,
    title="TF-IDF 3D PCA Bag of Words with Stop Words and Sent Length",
)
fig.show()
As we can see after turning off the manifesto and Darwin datasets, we had limited success. While we technically have slightly more Unabomber points clustered with the Ted letters, the difference isn't statistically significant. This could be a limitation of our data: in Dr. O'Sullivan's work, stylometry was performed on entire novels, whereas here we are performing it on short letters. There is another possibility. Perhaps we should expect this result. After all, it makes sense that two brothers raised together would naturally share a stylometric signature.
Plot 5¶
The code below is mostly not my own. It has been adapted from Dr. O'Sullivan's 2025 paper in Humanities and Social Sciences Communications, in which he uses Burrows's Delta in combination with a dendrogram to analyze human versus AI-generated texts. I have adapted the load_texts function to work with our data frame. I personally had trouble gaining any insights with this visualization. However, I wanted to include it for completeness, as Burrows's Delta seems to be a popular method in stylometry. You can find a link to his paper and code repository below.
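At its core, Burrows's Delta is just the mean absolute difference of z-scores over the most frequent words. A minimal sketch on made-up z-scores (the numbers and text names below are invented):

```python
import numpy as np
import pandas as pd

# toy z-score matrix: rows are most-frequent words, columns are texts;
# each cell is that word's frequency z-scored across the corpus
z = pd.DataFrame(
    {"text_A": [0.5, -1.0, 0.2], "text_B": [0.1, -0.8, 1.0]},
    index=["the", "of", "and"],
)

# Delta(A, B) = mean over words of |z_A - z_B|
delta = float(np.mean(np.abs(z["text_A"] - z["text_B"])))
# here: (0.4 + 0.2 + 0.8) / 3 ≈ 0.467
```

The full pipeline below does exactly this for every pair of documents, then clusters the resulting distance matrix.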
# legit stylometry
# https://www.nature.com/articles/s41599-025-05986-3
# burrows delta mds scatter
# adapted from
# https://github.com/jamesosullivan/stylometry/blob/main/burrows-delta-dendrogram.py
# 1. Load texts from the data frame
def load_texts():
    texts = {}
    for dt in df["Doc Type"].unique():
        i = 1
        for txt in df[df["Doc Type"] == dt]["rawTxt"]:
            texts[f"{dt}_{i}"] = txt
            i += 1
    return texts
# 2. Preprocess texts
def preprocess(text):
    tokens = word_tokenize(text.lower())  # Tokenise and lowercase
    filtered_tokens = [word for word in tokens if word.isalnum()]  # Remove punctuation
    return filtered_tokens
# 3. Compute word frequencies
def compute_frequencies(tokenised_texts, mfw=100):
    all_tokens = []
    for tokens in tokenised_texts.values():
        all_tokens.extend(tokens)
    most_common_words = [
        word for word, _ in Counter(all_tokens).most_common(mfw)
    ]  # Most frequent words (MFW)
    frequencies = {}
    for name, tokens in tokenised_texts.items():
        word_counts = Counter(tokens)
        frequencies[name] = {word: word_counts[word] for word in most_common_words}
    return pd.DataFrame(frequencies).fillna(0)
# 4. Calculate z-scores
def calculate_z_scores(frequency_matrix):
    # z-score each word's frequency across texts (rows are words)
    return frequency_matrix.apply(lambda row: (row - row.mean()) / row.std(), axis=1)
# 5. Compute Burrows's Delta
def compute_delta(z_matrix):
    # build as float to avoid object-dtype downcasting warnings from fillna
    delta_matrix = pd.DataFrame(
        index=z_matrix.columns, columns=z_matrix.columns, dtype=float
    )
    for text1 in z_matrix.columns:
        for text2 in z_matrix.columns:
            delta = np.mean(np.abs(z_matrix[text1] - z_matrix[text2]))
            delta_matrix.loc[text1, text2] = delta
    # Symmetrise the matrix
    delta_matrix = delta_matrix.fillna(0)  # Replace NaNs
    delta_matrix = (delta_matrix + delta_matrix.T) / 2  # Ensure symmetry
    np.fill_diagonal(delta_matrix.values, 0)  # Diagonal must be 0
    return delta_matrix
# 6. Extract Groups for Colour Coding
def extract_groups(filenames):
    """
    Extract groups from filenames based on the text before the first `_`.

    Args:
        filenames (list): List of filenames.

    Returns:
        list: Groups for each filename.
    """
    return [filename.split("_")[0] for filename in filenames]
# 7. Visualise Delta Matrix with Colour-Coded Dendrogram
def plot_coloured_dendrogram(delta_matrix, groups, save_as=None):
    """
    Visualise the Burrows's Delta matrix using a colour-coded dendrogram.

    Args:
        delta_matrix (pd.DataFrame): Pairwise distances between texts.
        groups (list): Groups for colour coding.
        save_as (str, optional): File path to save the plot. Defaults to None.
    """
    # Convert the Delta matrix to a condensed distance matrix
    condensed_matrix = squareform(delta_matrix.values)
    # Perform hierarchical clustering
    linkage_matrix = linkage(condensed_matrix, method="average")
    # Map groups to colours
    unique_groups = list(set(groups))
    cmap = ListedColormap(plt.cm.tab10(np.linspace(0, 1, len(unique_groups))))
    colours = {
        group: cmap(i / len(unique_groups)) for i, group in enumerate(unique_groups)
    }
    # Create the dendrogram with colour-coded labels
    plt.figure(figsize=(12, 10))
    dendrogram(
        linkage_matrix,
        labels=delta_matrix.columns,
        leaf_rotation=90,
        leaf_font_size=10,
        color_threshold=0,
    )
    # Apply colour coding to the labels
    ax = plt.gca()
    xlbls = ax.get_xmajorticklabels()
    for lbl in xlbls:
        group = lbl.get_text().split("_")[0]
        lbl.set_color(colours[group])
    # Add titles, labels, and legend
    plt.title("Burrows's Delta")
    plt.xlabel("Texts")
    plt.ylabel("Distance")
    plt.tight_layout()
    # Save or show plot
    if save_as:
        plt.savefig(save_as)
        print(f"Dendrogram saved as '{save_as}'.")
    plt.show()
# Load, preprocess, and analyse texts
texts = load_texts()
preprocessed_texts = {key: preprocess(value) for key, value in texts.items()}
frequency_matrix = compute_frequencies(preprocessed_texts, mfw=100) # MFW set to 100
z_scores = calculate_z_scores(frequency_matrix)
delta_matrix = compute_delta(z_scores)
# Extract groups for colour coding
groups = extract_groups(delta_matrix.columns)
# Plot Colour-Coded Dendrogram
plot_coloured_dendrogram(delta_matrix, groups)
Conclusion¶
- Ted Kaczynski shares a writing style with the Unabomber letters; however, so does the brother who turned him in.
- Stylometry can be a useful tool in performing linguistic analysis, but this usefulness can be limited by the quantity and quality of text.
- It can't directly determine if two texts are authored by the same person, only that they share a style.
- It should not be used on its own to make critical decisions.
Citations¶
- O’Sullivan, J. Stylometric comparisons of human versus AI-generated creative writing. Humanit Soc Sci Commun 12, 1708 (2025). https://doi.org/10.1057/s41599-025-05986-3
- Grant, Tim. The Idea of Progress in Forensic Authorship Analysis. Cambridge: Cambridge University Press, 2022. Print. Elements in Forensic Linguistics.