Dynamically scrape paginated table with BeautifulSoup and store results in csv?
I've made some changes to your script which should make it easier to debug and maintain. It uses pandas to simplify writing to CSV and concurrent.futures to speed things up: each year is scraped concurrently. I scrape the first page to get the number of pages for that year, then loop over each page and parse the HTML. The key pieces of info are put into a dictionary and appended to a list, which is easy to write to CSV via pandas since a list of dicts is essentially a DataFrame already. If you have questions, let me know.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures


def scrape_year(year):
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }

    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'

    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')

    page_container = soup.find('div', {'class': 'pagenumbers'})
    pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])

    info = []
    for page in range(1, pages + 1):
        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)

        data = requests.get(new_url, headers=headers)
        soup = BeautifulSoup(data.text, 'html.parser')

        table = soup.find('table', {'class': 'hp'})
        regex = re.compile('list.*')
        for index, row in enumerate(table.find_all('tr', {'class': regex})):
            if index == 0:
                continue

            acc_link = 'https://aviation-safety.net/' + row.find('a')['href']
            try:
                acc_date = datetime.strptime(row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
            except ValueError:
                try:
                    acc_date = datetime.strptime("01" + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    try:
                        acc_date = datetime.strptime("01-01" + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        continue

            acc_type = row.find_all('td')[1].text
            acc_reg = row.find_all('td')[2].text
            acc_operator = row.find_all('td')[3].text
            acc_fat = row.find_all('td')[4].text
            acc_location = row.find_all('td')[5].text
            acc_dmg = row.find_all('td')[7].text

            item = {
                'acc_link': acc_link,
                'acc_date': acc_date,
                'acc_type': acc_type,
                'acc_reg': acc_reg,
                'acc_operator': acc_operator,
                'acc_fat': acc_fat,
                'acc_location': acc_location,
                'acc_dmg': acc_dmg
            }
            info.append(item)

    df = pd.DataFrame(info)
    df.to_csv(f'{year}_aviation-safety.csv', index=False)


if __name__ == "__main__":
    START = 1916
    STOP = 2022
    years = [year for year in range(START, STOP + 1)]
    print(f'Scraping {len(years)} years of data')

    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year, years)
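If you would rather end up with one combined CSV instead of a file per year, a small variation is possible. This is only a sketch, and it assumes scrape_year is changed to return info instead of writing its own file:

# Sketch: collect the per-year lists from executor.map and write a single CSV.
# Assumes scrape_year(year) ends with `return info` instead of df.to_csv(...).
if __name__ == "__main__":
    years = range(1916, 2023)
    all_rows = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map yields each year's list of dicts in submission order
        for year_rows in executor.map(scrape_year, years):
            all_rows.extend(year_rows)
    pd.DataFrame(all_rows).to_csv('aviation-safety_all_years.csv', index=False)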
What happens?
First of all, always look into the soup - therein lies the truth.
You are missing the headers in the request inside the while loop; that causes a 403 error and the table is not selected correctly.
How to fix it?
Set the headers correctly for your request in the while loop:
html = requests.get(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})
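The same effect can also be had with a requests.Session that carries the User-Agent as a default header, so every request inside the loop sends it automatically. Just a sketch of that variant:

import requests

# Sketch: a shared session, so the User-Agent header is sent with every request
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
html = session.get(url)   # no per-call headers argument needed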
Select your rows more specifically - note that there is no tbody in the HTML:
# Go through the table rows and extract the data under the 'td' tags
for row in soup.select('table tr.list'):
Also check the selectors for the pagination:
# If more than one page then iterate through all of them
if soup.select_one('div.pagenumbers span.current + a'):
    url = 'https://aviation-safety.net/wikibase/dblist.php'+soup.select_one('div.pagenumbers span.current + a')['href']
else:
    break
Example
import requests, csv
from bs4 import BeautifulSoup

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

with open('1916_aviation-safety.csv', "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["acc. date", "Type", "Registration", "operator", "fat", "Location", " ", "dmg", " ", " "])

    while True:
        print(url)
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through the table rows and extract the data under the 'td' tags
        for row in soup.select('table tr.list'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
            print(row)

        # If there is more than one page, iterate through all of them
        if soup.select_one('div.pagenumbers span.current + a'):
            url = 'https://aviation-safety.net/wikibase/dblist.php' + soup.select_one('div.pagenumbers span.current + a')['href']
        else:
            break
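As a side note (my addition, not part of the answer above): instead of concatenating the base URL with the href by hand, urllib.parse.urljoin can resolve the relative href against the current page URL, which is a little more robust if the link format ever changes:

from urllib.parse import urljoin

def next_page_url(current_url, soup):
    # Hypothetical helper: returns the absolute URL of the next page, or None on the last page
    next_link = soup.select_one('div.pagenumbers span.current + a')
    return urljoin(current_url, next_link['href']) if next_link else None

Inside the while loop you would then set url = next_page_url(url, soup) and break when it returns None.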
Just in case
Alternative solution with pandas.read_html()
that iterates over all years:
import requests, time, random
import pandas as pd
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

url = 'https://aviation-safety.net/wikibase/'
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')

data = []

for url in ['https://aviation-safety.net/' + a['href'] for a in soup.select('a[href*="/wikibase/dblist.php"]')]:
    while True:
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')
        data.append(pd.read_html(soup.prettify())[0])

        # If there is more than one page, iterate through all of them
        if soup.select_one('div.pagenumbers span.current + a'):
            url = 'https://aviation-safety.net/wikibase/dblist.php' + soup.select_one('div.pagenumbers span.current + a')['href']
        else:
            break

        time.sleep(random.random())

df = pd.concat(data)
df.loc[:, ~df.columns.str.contains('^Unnamed')].to_csv('aviation-safety.csv', index=False)
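If you run the full crawl, you will probably also want to guard against a non-200 response and against pages where pandas.read_html finds no table (it raises ValueError in that case). A minimal sketch of such a guard, meant to drop into the while loop above and reusing its variables:

# Sketch: defensive version of the per-page fetch inside the while loop
html = requests.get(url, headers=headers)
html.raise_for_status()                     # fail loudly on 403/404/5xx
soup = BeautifulSoup(html.text, 'html.parser')
try:
    data.append(pd.read_html(soup.prettify())[0])
except ValueError:
    # pandas raises ValueError when no <table> is found on the page
    print(f'No table found on {url}, skipping')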