1. Introduction and Hypothesis¶
More female writers emerged in 19th-century America than in any preceding century. During this time, the Industrial Revolution began a period of transformation for the American experience. In 1848, the first feminist convention was held at Seneca Falls, marking the beginning of a new era for women in America. As the first-wave feminist movement grew, increased access to education led to a significant rise in the number of female fiction writers. Studying works by female writers provides us with unique perspectives on the female experience during a time of great social change. However, at the time these novels were written, female fiction was not regarded as 'serious literature,' and speaking out against social expectations of women could draw harsh criticism.
Most scholars have argued that female writers in 19th-century America only began presenting their true views on the role of women toward the end of the century. At the beginning of the century, most works by women writers expressed a passive acceptance of their domestic roles. By the end of the century, more female writers had begun to present the woman as a protagonist who resists the repression of a patriarchal society and demands a more equal partnership with a man.
To investigate these claims of a gradual feminist movement, I will aim to explore the following question: To what extent do female writers speak out against social expectations of women?
I hypothesize that more female writers speak out against social expectations toward the end of the century, after the Seneca Falls Convention in 1848. Topics and rhetoric surrounding the strong, independent woman should contrast with the more conformist, domestic writing from earlier in the century. Additionally, I hypothesize that the topics female writers took up during this time will center largely on the woman's experience within her family. As feminist attitudes gained momentum over the course of the century, we would expect to find a growing market for feminist writing that challenges conventional sex roles.
2. Corpus, data, and methods¶
Corpus and Data¶
From a corpus of 1,540 volumes of American fiction published between 1789 and 1875, I have pulled works by female authors, resulting in a corpus of 420 novels. Because this corpus covers only about 40% of the American fiction of the period (1789-1875), I am limited to novels by female authors that have been preserved in American academic libraries. The corpus includes only 180 unique female authors, and because preservation depended on academic libraries, novels are unevenly distributed across authors: roughly 33% of the novels were written by just 10 authors, who may not encapsulate the views or topics of all female writers of the time. Additionally, the majority of authors are white; there is only one Black author in this corpus, which largely limits my exploration of the female perspective to that of white women. Lastly, the distribution of publication dates shows that far more novels were written mid-century, which could limit comparisons of features across the century.
Methods¶
Pre-processing: I will first subset the corpus to include only female writers of the 19th century. To pre-process, I will label each novel as published before or after 1848, the year of the first feminist convention and the conventional start of first-wave feminism. Next, I will split each novel into ~10 chunks: I will tokenize the text, count tokens as I go, and emit a chunk each time the count reaches a target chunk length (a minimal sketch of this logic follows). I will then detokenize these token lists back into strings and store the chunks in both a labeled dictionary and a nested list, ready to be passed to a vectorizer and an LDA model. Finally, I will transform the nested list into a DataFrame so that I can merge it with the metadata for later analysis.
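Below is a minimal, illustrative sketch of that chunking logic; the full implementation, which also skips punctuation when counting and detokenizes each chunk back into a string, appears in the Chunking section.

# Illustrative sketch: accumulate tokens until a target chunk length is
# reached, then emit a chunk; keep any uneven final chunk
def chunk_tokens(tokens, chunk_length):
    chunks, working = [], []
    for token in tokens:
        working.append(token)
        if len(working) == chunk_length:
            chunks.append(working)
            working = []
    if working:
        chunks.append(working)
    return chunks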
Feminist Topic Modeling: After chunking and vectorizing, I will use topic modeling to find patterns in subject matter among female writers during this time. This will capture overall changes in thought over time, which I expect will show a difference in topics between the first and second halves of the century. By looking at top words and different variables from the metadata, I can further analyze my topic-modeling results. Additionally, to better label each topic, I will examine embedding-based similarities between words. This will help me determine whether more female writers speak out about the experiences of women after the feminist movement begins to take hold and demand for feminist thought grows.
Regression and Classification: I will use my topic-model features to build two models: a regressor and a classifier. The regressor will test whether topics are informative enough to predict the actual publication year of a book. The classifier will test whether topics can predict whether a novel was published before or after the first feminist convention in 1848. Together, these predictors will show whether novels by female authors differentiate themselves by topic over time.
Pre-processing¶
# Imports
import os
import string
from collections import Counter, defaultdict
from glob import glob

import matplotlib.pyplot as plt  # needed for the plots below
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.preprocessing import StandardScaler
# File locations
# Note that metadata are supplied as a TSV file
# Text files are in a directory, one file per (long, novel-like) document
metadata_file = os.path.join('data', 'us_fiction', 'corpus_data.tsv')
text_dir = os.path.join('data', 'us_fiction', 'us_texts')
# Load the metadata
metadata = pd.read_csv(
metadata_file,
sep='\t',
low_memory=False
).set_index('source_id')
# Subset metadata to female authors only (copy so we can add columns later)
fem = metadata[metadata['gender'] == 'F'].copy()
fem.describe()
 | pub_date | gender_guess | born | died | words
---|---|---|---|---|---
count | 420.000000 | 420.000000 | 307.000000 | 294.000000 | 420.000000 |
mean | 1860.038095 | 0.288095 | 1816.446254 | 1885.911565 | 85458.197619 |
std | 12.452621 | 0.453416 | 15.356745 | 16.712761 | 40334.030996 |
min | 1797.000000 | 0.000000 | 1759.000000 | 1840.000000 | 4521.000000 |
25% | 1854.000000 | 0.000000 | 1809.000000 | 1874.000000 | 57685.750000 |
50% | 1862.000000 | 0.000000 | 1818.000000 | 1886.000000 | 81393.500000 |
75% | 1870.000000 | 1.000000 | 1827.000000 | 1896.750000 | 111335.500000 |
max | 1875.000000 | 1.000000 | 1854.000000 | 1932.000000 | 247258.000000 |
Here I have pulled the metadata for the corpus of 420 novels. The data is now limited to female authors in a new DataFrame, fem.
print('fraction of novels written by the top ten authors: {:.3f}%'.format(
fem.author.value_counts(normalize=True)[:10].sum()*100))
fraction of novels written by the top ten authors: 33.095%
fem.head()
source_id | author | title | pub_place | publisher | pub_date | gender | gender_guess | ethnicity | occupation | occupation_free | state_born | state_main | state_died | born | died | words
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
eaf002 | Bacon, Delia Salter | Tales of the puritans | New Haven [Conn.] | A. H. Maltby | 1831 | F | 0.0 | White | Education | Teacher | OH | CT | CT | 1811.0 | 1859.0 | 70010 |
eaf003 | Bacon, Delia Salter | Love's martyr | Cincinnati | Printed by E. Morgan and Co. | 1838 | F | 0.0 | White | Education | Teacher | OH | CT | CT | 1811.0 | 1859.0 | 13547 |
eaf004 | Bacon, Delia Salter | The bride of Fort Edward | New York | Samuel Colman | 1839 | F | 0.0 | White | Education | Teacher | OH | CT | CT | 1811.0 | 1859.0 | 34309 |
eaf026 | Brooks, Maria Gowen | Idomen, or, The vale of Yumuri | New York | Samuel Colman | 1843 | F | 0.0 | White | Writer | Poet | MA | MA | Cuba | 1794.0 | 1845.0 | 48844 |
eaf041 | Child, Lydia Maria Francis | Hobomok | Boston | Cummings, Hilliard & Co. | 1824 | F | 0.0 | White | Politics-Government-Activism | Activist | MA | MA | MA | 1802.0 | 1880.0 | 56056 |
# Label novels as published before (0) or during/after (1) 1848
fem['label'] = (fem['pub_date'] >= 1848).astype(int)
print('baseline: {}'.format(fem['label'].sum()/len(fem)))
baseline: 0.8952380952380953
# Distribution of publication dates in the female-authored corpus
fem.pub_date.plot.hist(bins=fem.pub_date.max()-fem.pub_date.min()+1);
Looking at the distribution of authors and publication dates, ~33% of novels are concentrated amongst 10 authors and a large number of novels were published mid-century. These distributions will be important to keep in mind, as they may skew the results of analyses.
novel_files = glob(os.path.join(text_dir, '*'))
# Map each file's basename to its path, then pull the female-authored novels
file_lookup = {os.path.basename(n): n for n in novel_files}
fem_files = [file_lookup[i] for i in fem.index if i in file_lookup]
len(fem_files)
420
all_novels = {}
for n in fem_files:
    title = os.path.basename(n)
    with open(n, 'r') as f:
        all_novels[title] = f.read()
Above, I have opened all of the corpus files; next I will begin data exploration by chunking the texts for LDA.
Chunking¶
punct = set(string.punctuation)
detok = TreebankWordDetokenizer()  # reuse one detokenizer instance
all_chunks = {}
years = []
labels = []
nest_chunks = []
for file in fem_files:
    title = os.path.basename(file)
    wordcount = fem.loc[title, 'words']  # use wordcount from metadata to simplify
    chunk_length = round(wordcount/10)
    with open(file, 'r') as f:
        counter = 0          # running token count (punctuation excluded)
        chunks = []          # chunks from this novel
        working_tokens = []  # working token list for the current chunk
        lines = f.readlines()
        for line in lines:
            tokens = word_tokenize(line)  # tokenize by line
            for token in tokens:          # loop over tokens
                working_tokens.append(token.lower())
                if token not in punct:    # don't count punctuation toward chunk length
                    counter += 1
                if counter == chunk_length:  # emit a chunk when the count hits the target
                    counter = 0
                    chunks.append(detok.detokenize(working_tokens))  # back to a string
                    years.append(fem.loc[title, 'pub_date'])
                    labels.append(fem.loc[title, 'label'])
                    working_tokens = []
        if working_tokens:  # account for an uneven (non-empty) final chunk
            chunks.append(detok.detokenize(working_tokens))
            years.append(fem.loc[title, 'pub_date'])
            labels.append(fem.loc[title, 'label'])
    all_chunks[title] = chunks
    nest_chunks += chunks
len(all_chunks)
420
chunked_df=pd.DataFrame.from_dict(all_chunks, orient='index')
new=chunked_df.merge(fem[['pub_date']], left_index=True, right_index=True)
new.head()
source_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | pub_date
---|---|---|---|---|---|---|---|---|---|---|---|---
eaf002 | the regicides we dig no lands for tyrants but ... | , move your chair for mr. russel . margaret ga... | and there hollowed into deep recesses which pa... | companions, or even cherished in her secret he... | no fancy then; that deep groan had borne its o... | more noble than a resting place in the tombs o... | by which she had leaned a few hours since, was... | dwelling, which she was that moment passing . ... | her in french with ease and fluency . listen, ... | remember a sin of far more deadly hue than aug... | None | 1831 |
eaf003 | the loved, the hated, the adored, each mortal,... | bright attributes that the common experience o... | family as nothing utterly worthless, that you ... | of these savages . the unconscious and bewilde... | home with him to england, and told me how happ... | window . the sound of retiring steps soon echo... | had last vanished listening breathlessly for s... | soon lost to her view, as with that strange an... | hill, preparing to scalp and murder each other... | to the left, in a small glen, but as the level... | to cloud the brightness of the eternal noontid... | 1838 |
eaf004 | 2 the bride of fort edward . part first . indu... | blowing without there, wasting for ever; and n... | and stones of this dull earth were precious to... | .' surely the bitterness is deep when that whi... | , my lady! here's some one at the gate . (an o... | ? this don't look much like it . 4 th sol . i ... | meet now, we are parted for ever; if i do not ... | speedily report our absence . 2nd sol . well, ... | as good . yes sir, yes sir, they are flocking ... | could not be . they told us she was murdered, ... | None | 1839 |
eaf026 | recital . the fireside . various misfortunes h... | their short robes or tunics of clean linen, bo... | ; and that his voluntary absence from a more h... | too cold marble before the picture of idomen; ... | hare and ptarmigan were seen in the sparkling ... | black bearing my usual impression . ` i looked... | of a desolate soul! power even to seek the gra... | confessing to him all i had felt; but the powe... | , said ethelwald, your address, and you shall ... | that roundness of form most remarked in the la... | None | 1843 |
eaf041 | 1 * hobomok . chap . i . how daur ye try sic s... | not worth the tears, which an onion draweth fo... | gratefully partaken, and all john's exploits i... | was considered as ishmael in the house of abra... | a controversial discussion with the plymouth e... | into her apartment, and hiding her face in the... | abide with you, on account of her sometime acq... | , in an agitated voice . verily, my dear wife,... | , as she uttered some mournful and incoherent ... | of his heart . he had proceeded near half a mi... | as he forgiveth you . and nowe, god in his mer... | 1824 |
Here, I have tokenized and chunked all of the text into ~10 chunks per novel, then detokenized each chunk into a single string to be passed to the vectorizer. Finally, I have created a DataFrame that stores these chunks under their text-file names, which will let me merge with the metadata later to look further into topical patterns.
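As a quick, illustrative sanity check (not part of the pipeline itself), I can confirm that each novel produced roughly 10-11 chunks:

# Distribution of chunk counts across novels (expect ~10-11 per novel)
chunk_counts = pd.Series({title: len(chunks) for title, chunks in all_chunks.items()})
print(chunk_counts.describe())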
Vectorizer to LDA¶
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
input = 'content',
encoding = 'utf-8',
strip_accents = 'unicode',
stop_words='english',
lowercase = True,
max_df = .75,
min_df = 0.01
)
X = vectorizer.fit_transform(nest_chunks)
print("Feature matrix shape:", X.shape)
print("Total vectorized words in the corpus:", X.sum())
print("Average vectorized chunk length:", int(X.sum()/X.shape[0]), "tokens")
Feature matrix shape: (4608, 16684)
Total vectorized words in the corpus: 10769245
Average vectorized chunk length: 2337 tokens
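For context on the max_df/min_df settings, the following illustrative comparison (an extra diagnostic, not part of the pipeline) shows how much the document-frequency cutoffs prune the vocabulary:

# A term must appear in at least 1% and at most 75% of chunks to be kept
unpruned = CountVectorizer(stop_words='english').fit(nest_chunks)
print('unpruned vocabulary size:', len(unpruned.get_feature_names_out()))
print('pruned vocabulary size: ', X.shape[1])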
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(
n_components=20, # Number of topics to find
n_jobs=-1, # Use all CPU cores
verbose=1, # Print progress
max_iter=50, # Might want more in production work
evaluate_every=0 # Set >=1 to test for convergence (slow, but can stop iteration)
)
lda.fit(X)
iteration: 1 of max_iter: 50
iteration: 2 of max_iter: 50
...
iteration: 50 of max_iter: 50
LatentDirichletAllocation(evaluate_every=0, max_iter=50, n_components=20, n_jobs=-1, verbose=1)
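One optional diagnostic (a sketch using scikit-learn's built-in perplexity method, not part of the original analysis) scores the fitted model on the training matrix; this is mainly useful for comparing runs with different topic counts:

# Lower perplexity is better; compare across n_components settings
print('perplexity:', round(lda.perplexity(X), 1))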
# from topic modeling lecture
def print_top_words(model, feature_names, n_top_words, hide_stops=False):
if hide_stops:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
for topic_idx, topic in enumerate(model.components_):
message = f"Topic {topic_idx: >2}: "
top_words_idx = topic.argsort()
if not hide_stops:
top_words = [feature_names[i]
for i in top_words_idx[:-n_top_words - 1:-1]]
else:
top_words = []
i = 1
while len(top_words) < n_top_words:
if feature_names[top_words_idx[-i]] not in ENGLISH_STOP_WORDS:
top_words.append(feature_names[top_words_idx[-i]])
i += 1
message += " ".join(top_words)
print(message)
print()
print_top_words(lda, vectorizer.get_feature_names_out(), n_top_words=10, hide_stops=True)
Topic 0: thou thy thee sweet spirit heaven beauty earth eye deep Topic 1: alice bessie frank laura herbert willie replied sister hamilton ashley Topic 2: ll tom yer got dat em ye miss ve master Topic 3: church lord holy mary christ sister prayer faith christian heaven Topic 4: philip miss susan answered lucia honor thoughts spoke need power Topic 5: water boy black trees road sea horse red wind sun Topic 6: elsie ll loved dead matter hard miss ve lips marry Topic 7: mary ll says doctor got kate money ruth bed boy Topic 8: ll reply boy daisy ve replied warren glad business son Topic 9: agnes mabel robert uncle john julia fanny charles lydia richard Topic 10: ellen lucy aunt clara mary isabel cora miss constance esther Topic 11: captain sir colonel lord miss sybil exclaimed general heaven inquired Topic 12: helen replied daughter son power fate future placed fortune louise Topic 13: happiness replied feelings edith husband affection continued manner loved florence Topic 14: lips fell cried arms answered wild pale dead bed husband Topic 15: jane country replied received miss party indian till eye son Topic 16: harry lily miss amy owen arnold nancy adele marcia ophelia Topic 17: women society human power country eva self fact state certain Topic 18: miss pretty ladies dress music cousin beauty gentleman hair party Topic 19: margaret maud roland effie allan bruce irene edmund st bishop
Using the function provided in lecture, print_top_words, I can now see the top 10 words for each of the 20 topics. I will now apply the fitted LDA model to transform the feature matrix, which outputs, for each chunk, its probability of belonging to each topic.
Doc-topic Matrix¶
import warnings
with warnings.catch_warnings():
warnings.simplefilter('ignore')
doc_topic_matrix = lda.transform(X)
print("Doc-topic matrix shape:", doc_topic_matrix.shape)
Doc-topic matrix shape: (4608, 20)
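Each row of this matrix is a probability distribution over the 20 topics, which a quick illustrative check (not in the original analysis) confirms:

# Every row of the doc-topic matrix should sum to ~1
print('first five row sums:', doc_topic_matrix.sum(axis=1)[:5].round(3))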
sorted_df=new[[0,1,2,3,4,5,6,7,8,9,10]].T
sorted_df=sorted_df.fillna(' ')
# Examine dominant topics per text
def find_dominant(model, df, vectorizer):
    """
    Returns a DataFrame labeling each document with its dominant topic
    """
    labeled_df = pd.DataFrame()
    for c in df.columns:
        doc = [' '.join(df[c])]  # rejoin the novel's chunks into one text
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            proba = model.transform(vectorizer.transform(doc))
        top = proba[0].argmax()  # index of the most probable topic
        labeled_df.loc[c, 'dom_top'] = top
        labeled_df.loc[c, 'top_prob'] = proba[0][top]
    return labeled_df
dom_df=find_dominant(lda, sorted_df, vectorizer)
dom_df.head()
source_id | dom_top | top_prob
---|---|---
eaf002 | 0.0 | 0.291249 |
eaf003 | 0.0 | 0.385452 |
eaf004 | 0.0 | 0.394618 |
eaf026 | 0.0 | 0.422981 |
eaf041 | 15.0 | 0.291122 |
After executing the function find_dominant, I can now merge with the metadata and create some visualizations based on the dominant topic of each novel.
Dominant Topics and Keyword Weighting¶
new_fem=fem.merge(dom_df, left_index=True, right_index=True)
plt.figure(figsize=(10,8))
g=sns.histplot(x='dom_top', hue='label', data=new_fem, multiple='stack',
palette='pastel', binwidth=1, stat='density', common_norm=False)
g.set_xticks(range(20), labels=range(20))
plt.title('Dominant Topic Labels by Time Period')
plt.xlabel('Dominant Topic')
plt.legend(title='Year', loc='upper right', labels=['1848 and After', 'Before 1848'])
plt.show()
no_15=len(new_fem[(new_fem['dom_top']==15) & (new_fem['label']==0)])
print(f'number of novels written before 1848 with dominant topic 15: {no_15}')
number of novels written before 1848 with dominant topic 15: 23
We see that for novels written before 1848, a large majority have dominant topic 15, whose top words are: jane country replied received miss party indian till eye son. For novels written in 1848 and after, the most common dominant topic is topic 13, whose top words are: happiness replied feelings edith husband affection continued manner loved florence. At first glance, the top ten words of topic 13 may align with my hypothesis that female writers would write more about the grievances of domestic life later in the century.
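To back up the histogram with exact counts, here is a small illustrative cross-tabulation (an added check, not in the original analysis) of dominant topic by period label:

# Rows are dominant topics; columns are period labels (0 = pre-1848, 1 = 1848+)
print(pd.crosstab(new_fem['dom_top'], new_fem['label']))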
fig, axs = plt.subplots(5, 4, figsize=(30,20))
col=0
row=0
for topic_idx, topic in enumerate(lda.components_):
message = f"Topic {topic_idx: >2}: "
top_words_idx = topic.argsort()
word_scores=lda.components_[topic_idx]
feats_names = vectorizer.get_feature_names_out()
top_words = [feats_names[i]
for i in top_words_idx[:-10 - 1:-1]]
top_scores = [word_scores[i]
for i in top_words_idx[:-10 - 1:-1]]
ax1=sns.barplot(x=top_words, y=top_scores, ax=axs[row, col], palette='Accent')
if row==4:
col+=1
row+=1
if row>4:
row=0
ax1.title.set_text(f'Topic {topic_idx}')
plt.tight_layout()
Here you can see the weights of individual words within a topic. For Topic 13, the most heavily weighted word is happiness, but other words carry similar weight. For Topic 15, the most heavily weighted word is jane, followed by country and replied.
# Examine topic distribution in a random chunk
import random
doc_idx = random.randrange(len(doc_topic_matrix))
print(f"Topic distributions in chunk {doc_idx} ({years[doc_idx]})")
print("Sample of the chunk:\n", nest_chunks[doc_idx][:2000], "\n")
# Report the dominant topic for this chunk
print(f'Dominant Topic {doc_topic_matrix[doc_idx].argmax()}')
Topic distributions in chunk 1147 (1859)
Sample of the chunk:
 medora in the boudoir at lazy-bank, his restless excitement was such, that he could scarcely remain quietly seated . she came in, looking a shade paler than usual, and met him with an embarrassment in her manner almost equal to his own . little did he suspect the weary vigil she had kept last night, in endeavoring to school herself to the calm contemplation of accepting the offer that she felt sure would the next day he made, or that she met him with the deliberate intention of not committing herself until she was well assured that his worldly prospects were as brilliant as she had been led to suppose . floyd ventured to seat himself beside her on the sofa, and then with trembling lips began: — "miss medora, you can readily guess that since last night all my thoughts have been full of you, and that i have come here this morning to tell you how much i love you ." then glancing at her downcast eyes, he went on hurriedly . do not condemn my selfishness, though i have nothing now to give you but a true and faithful heart . i should not have ventured to address you had i not been, until to-day, ignorant of my real situation ." the color which his words had ripened to a blush in medora's cheeks faded slowly away, and she released herself from the trembling arm that he had stolen about her waist . at this act, floyd, who had paused, awaiting her reply in breathless silence, exclaimed passionately: — "oh, medora, do not repulse me; only, for pity's sake, tell me that you love me a little ." she glanced at his convulsed features, and touched by the depth of real feeling they expressed, said gently— "my friend, do not agitate yourself so much; let us speak calmly on this subject, so important to us both ." "i will do anything you choose," replied floyd, with more composure, "if you will only tell me that you love me ." without noticing this appeal, medora asked: "why do you talk of your selfishness towards me? i have ever found you a thoughtful friend ." "i trust i should alw
Dominant Topic 13
This sample shows a snippet of a chunk with dominant topic 13. In this snippet, the female character is calm, collected, and strong-willed. She even speaks back to the man in the scene, saying, 'why do you talk of your selfishness towards me?' while ignoring his appeal for her to tell him she loves him. Since this is only one sample, I will need to look further into the semantic meanings of the topics' top words. Next, I will use embeddings to examine the top 10 words from topics 13 and 15 and their similarities to the words family, woman, strong, and home. I have chosen these words for exploratory reasons based on my initial hypothesis.
Embedding Similarities¶
def find_word_sims(vocab, nlp, sim_word):
"""
Returns DataFrame with cosine similarities between all words to the target similarity word
"""
vector_count = 0 # check to make sure all words have vectors
for v in vocab+[sim_word]:
if v in nlp.vocab.strings:
if nlp.vocab[v].has_vector:
vector_count+=1
# Make a word-vector matrix with labels
vector_matrix = np.zeros([vector_count,nlp.vocab.vectors_length]) # Initialize the output matrix
counter = 0
vocab_dict = {} # Dictionary to hold word index positions in the matrix
vocab_list = [] # List to hold words in order
for v in vocab+[sim_word]:
if v in nlp.vocab.strings:
if nlp.vocab[v].has_vector: # only want the ones with embeddings
vocab_dict[v] = counter # record position of this word
vocab_list.append(v) # add to our list of words
# l2-normalize the vector and update matrix
vector_matrix[counter] = nlp.vocab[v].vector/nlp.vocab[v].vector_norm
counter+=1 # increment counter
similarities = np.dot(vector_matrix, vector_matrix[vocab_dict[sim_word]])
top_n = np.argsort(similarities)[-10:][::-1]
# print(f'top ten words and their similarity to the word {sim_word}:\n')
df=pd.DataFrame()
for i in top_n:
        if vocab_list[i] != sim_word:  # exclude the target word itself
word=vocab_list[i]
df.loc[word, 'sim']= similarities[i]
df.loc[:, 'sim_word']=sim_word
return df
# Collect the top 10 words for topics 13 and 15
feats_names = vectorizer.get_feature_names_out()
vocab = []
for topic_idx in (13, 15):
    top_words_idx = lda.components_[topic_idx].argsort()
    vocab.append([feats_names[i] for i in top_words_idx[:-11:-1]])
nlp=spacy.load('en_core_web_lg')
sim_words=['family', 'woman', 'strong', 'home']
# TOPIC 13
fig, axs = plt.subplots(4, 1, figsize=(10,15))
for i in range(len(sim_words)):
df=find_word_sims(vocab[0], nlp, sim_words[i]).reset_index()
ax1= sns.barplot(x='index', y='sim', data=df, ax=axs[i], palette='Accent', order=vocab[0])
ax1.title.set_text(f'Topic 13 Word Similarities to {sim_words[i]}')
ax1.set_ylim(0,1)
ax1.set_xlabel('top 10 topic words')
ax1.set_ylabel('cosine similarity')
plt.tight_layout()
# TOPIC 15
fig, axs = plt.subplots(4, 1, figsize=(10,15))
for i in range(len(sim_words)):
df=find_word_sims(vocab[1], nlp, sim_words[i]).reset_index()
ax1= sns.barplot(x='index', y='sim', data=df, ax=axs[i], palette='Accent', order=vocab[1])
ax1.title.set_text(f'Topic 15 Word Similarities to {sim_words[i]}')
ax1.set_ylim(0,1)
ax1.set_xlabel('top 10 topic words')
ax1.set_ylabel('cosine similarity')
plt.tight_layout()
Looking purely at word similarities, it is clear that words from topic 13 are more similar to family, strong, and woman than words from topic 15, while the two sets of top words appear about equally similar to home. In the next section, I will use my topic-model features in a linear regression model and a classification model to determine the importance of topics in labeling novels.
Regression and Classification¶
# Predict novel date from topic content
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
# Fit and predict using topics
X_topics = StandardScaler().fit_transform(doc_topic_matrix)
predictor = LinearRegression().fit(X_topics, years)
y_pred = predictor.predict(X_topics)
# Score
print("Mean cross-validated R^2 (topics):", round(np.mean(cross_val_score(predictor,
X_topics,
years,
scoring='r2',
cv=10)),3))
# Plot
fig,ax = plt.subplots(figsize=(12,8))
sns.regplot(x=years, y=y_pred, scatter_kws={'alpha':0.1})
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Topics as features")
plt.show()
Mean cross-validated R^2 (topics): -0.482
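To put the negative R^2 in context, a constant-prediction baseline scores near zero by construction; the comparison below is an added illustrative check, not part of the original analysis:

# A DummyRegressor that always predicts the training-fold mean year
from sklearn.dummy import DummyRegressor
print("Mean cross-validated R^2 (mean baseline):",
      round(np.mean(cross_val_score(DummyRegressor(strategy='mean'),
                                    X_topics, years, scoring='r2', cv=10)), 3))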
Treating this as a regression problem performs poorly, which was expected given the skewed distribution of publication dates in the corpus. I will now use my topic features to fit a LogisticRegression model that predicts each chunk's label as before 1848 or during/after 1848.
# Predict novel label as before or after 1848 from topic content
from sklearn.linear_model import LogisticRegression
# Fit and predict using topics
predictor_log = LogisticRegression(max_iter=300).fit(X_topics, labels)
# Score
print("Mean accuracy (topics):", round(np.mean(cross_val_score(predictor_log,
X_topics,
labels,
scoring='accuracy',
cv=10)),3))
Mean accuracy (topics): 0.916
Treating this as a classification problem works much better, with a mean cross-validated accuracy of 0.916. Note, though, that the majority-class baseline computed earlier is 0.895, so topic features add a modest but real improvement when labeling a chunk as before or during/after 1848.
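To see where those gains over the 0.895 baseline come from, the confusion matrix below is built from cross-validated predictions (an added diagnostic, not in the original analysis):

# Rows are true labels, columns are predicted labels (0 = pre-1848, 1 = 1848+)
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
preds = cross_val_predict(predictor_log, X_topics, labels, cv=10)
print(confusion_matrix(labels, preds))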
3. Results¶
After pre-processing the texts into chunks suitable for LDA, I carried out topic modeling on female-written novels from 1797 to 1875. Topic modeling shows that novels written after the first feminist convention in 1848 discuss different topics than those written before 1848. Specifically, I found that the dominant topics of novels vary greatly with the period in which they were written. Additionally, I looked at the weights of the top 10 words for the most common dominant topic within each label. From the histogram, we see that Topic 13 is the most common dominant topic for novels written in or after 1848. The word with the most weight in this topic is happiness. However, to assess my hypothesis, I also had to look at the context of this word.
Looking at a short text sample with dominant topic 13, the woman's character clearly exhibits strength and resilience. Topic 15, on the other hand, has three highly weighted words: jane, country, and replied. These words are not particularly indicative of a certain way of thinking, but they also do not appear to relate to the words from topic 13. To investigate the semantics, I used word embeddings to compare all top words from topics 13 and 15 to the words home, woman, family, and strong. These words were taken from my initial hypothesis that women would write about the strong female character near the end of the century and focus on topics of family and home life. Looking at word similarities, I found that the top words of topic 13 were more similar to the words woman, family, and strong than the top words of topic 15. I have presented these similarities in a set of bar charts.
To determine the statistical importance of topics as features, I created a linear regression model as well as a logistic regression classifier. With the regression model, I tried to use topics to predict the year a novel was published, and found a mean cross-validated R$^2$ of -0.482, worse than simply predicting the mean year. With the classifier, topics proved informative for predicting whether novels were written before or during/after 1848, yielding an accuracy of 0.916 against a majority-class baseline of 0.895.
4. Discussion and Conclusions¶
The results of my classifier, using topic-model features as inputs, show that topics help predict whether a novel was written before or after the first feminist convention in 1848. In my hypothesis, I predicted that novels from these two periods would differ greatly in content. More specifically, I believed that novels written after 1848 would focus more on the role of a woman in a family and on dissatisfaction with domestic life. Topic modeling on its own, however, makes it difficult to infer the proper label for a topic, as it assumes that the prevalence of top words can segment different topics.
After initially generating 20 topics and looking at the dominant topic of each novel, I compared top words from two topics, 13 and 15, which were the dominant topics for label 1 (post-1848) and label 0 (pre-1848), respectively. Using word embeddings, I could tell whether these words fall under an umbrella word based on cosine similarities. My results support the hypothesis that the topics of post-1848 novels may best fall under the umbrella words family and woman. This suggests that analyzing works by American female writers of the 19th century can offer a unique perspective on the progression of the first-wave feminist movement.
There are a few limitations of my study which could be addressed in future work. First, the corpus I used was limited to novels preserved in academic libraries, which skewed the distribution of publication dates toward the middle and later decades of the period, as these novels are among the most famous. This analysis should be repeated on a larger and more evenly distributed corpus to validate its results. Additionally, I did not have time to look beyond word embeddings and the two dominant topics I focused on. It would be valuable to find proper labels for each topic based on n-grams and document embeddings.
5. Acknowledgements¶
Prof. Matthew Wilkens and the INFO 3350 TAs