Journal of Discourses Scraper
Python Google Colab code to scrape the Journal of Discourses for text analysis.
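The scraper walks all 26 volumes, keeping the title, the speaker/date heading, and the body text of each discourse, and saves the result as a csv file in Google Drive.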
# Import libraries for http requests, html parsing, regex cleanup, and dataframes
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Mount Google Drive so the output csv can be saved there
# (Colab prompts for authorization the first time this runs)
from google.colab import drive
drive.mount('/content/drive')
# Define the base url for the journal of discourses
base_url = "https://josephsmithfoundation.org/journalofdiscourses/"
# Create an empty list to store the links to each volume
volume_links = []
# Loop through the volume numbers from 1 to 26
for i in range(1, 27):
    # Append the volume number to the base url and add it to the list
    volume_link = base_url + "topics/volumes/volume-" + str(i) + "/?print=print-search"
    volume_links.append(volume_link)
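# Each link has the form https://josephsmithfoundation.org/journalofdiscourses/topics/volumes/volume-1/?print=print-search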
volume_links
# Create an empty list to store one row per discourse
volume_text = []
# Loop through each volume link
for volume_link in volume_links:
    print(volume_link)
    # Get the html content of the volume page
    discourse_page = requests.get(volume_link)
    # Stop early on a failed request instead of parsing an error page
    discourse_page.raise_for_status()
    # Parse the html content using BeautifulSoup
    discourse_soup = BeautifulSoup(discourse_page.content, "html.parser")
    # Pair each discourse title with its body text (assumes titles and
    # content divs appear in the same order on the page)
    for (title, text) in zip(discourse_soup.find_all("h1", class_="entry-title"), discourse_soup.find_all("div", class_="entry-content")):
        text = str(text).strip()
        text = re.sub('\n', '', text)
        text = re.sub('</div>', '\n', text)
        title = str(title).strip()
        title = re.sub('.*?"entry-title">', '', title)
        title = re.sub('</h1>', '', title)
        # The speaker/date heading sits in an <em> tag; guard against discourses without one
        heading_match = re.search(r'<em>(.*?)</em>', text)
        heading = heading_match.group(1) if heading_match else ''
        text = re.sub('<em>.*?</em>', '', text)
        # Strip any remaining html tags
        text = re.sub('<.*?>', '', text)
        row = [title, heading, text]
        print(row)
        volume_text.append(row)
# Build a dataframe with one row per discourse
df = pd.DataFrame(volume_text, columns=['title', 'heading', 'text'])
df
# Write the dataframe to Google Drive as a csv file
filename = 'JofD.csv'
df.to_csv('/content/drive/MyDrive/' + filename, index=False)
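Once the file is written, a later session can reload it straight from Drive; a minimal sketch, assuming the same path and the column names used above:
# Reload the saved csv (assumes the Drive path and filename above)
df = pd.read_csv('/content/drive/MyDrive/JofD.csv')
# Quick sanity check: word count per discourse as a starting point for text analysis
print(df['text'].str.split().str.len().describe())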