Dynamically scrape paginated table with BeautifulSoup and store results in csv?
I've made some changes to your script which should make it easier to debug and maintain. It uses pandas to simplify writing to CSV and concurrent.futures to speed things up: each year is scraped concurrently. I scrape the first page to get the number of pages for that year, then loop over each page and parse the HTML. The key pieces of info are put into a dictionary and appended to a list, which is easy to write to CSV via pandas since a list of dicts is essentially a DataFrame already. If you have questions, let me know.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures


def scrape_year(year):
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }

    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'

    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')

    page_container = soup.find('div', {'class': 'pagenumbers'})
    pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])

    info = []
    for page in range(1, pages + 1):
        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)

        data = requests.get(new_url, headers=headers)
        soup = BeautifulSoup(data.text, 'html.parser')

        table = soup.find('table', {'class': 'hp'})
        regex = re.compile('list.*')
        for index, row in enumerate(table.find_all('tr', {'class': regex})):
            if index == 0:
                continue

            acc_link = 'https://aviation-safety.net/' + row.find('a')['href']
            try:
                acc_date = datetime.strptime(row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
            except ValueError:
                try:
                    acc_date = datetime.strptime("01" + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    try:
                        acc_date = datetime.strptime("01-01" + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        continue

            acc_type = row.find_all('td')[1].text
            acc_reg = row.find_all('td')[2].text
            acc_operator = row.find_all('td')[3].text
            acc_fat = row.find_all('td')[4].text
            acc_location = row.find_all('td')[5].text
            acc_dmg = row.find_all('td')[7].text

            item = {
                'acc_link': acc_link,
                'acc_date': acc_date,
                'acc_type': acc_type,
                'acc_reg': acc_reg,
                'acc_operator': acc_operator,
                'acc_fat': acc_fat,
                'acc_location': acc_location,
                'acc_dmg': acc_dmg
            }
            info.append(item)

    df = pd.DataFrame(info)
    df.to_csv(f'{year}_aviation-safety.csv', index=False)


if __name__ == "__main__":
    START = 1916
    STOP = 2022
    years = [year for year in range(START, STOP + 1)]
    print(f'Scraping {len(years)} years of data')

    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year, years)
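If you would rather end up with one combined CSV instead of a file per year, a small variation is possible. This is only a sketch, and it assumes scrape_year is changed to return info instead of writing its own file:

# Sketch: collect the per-year lists from executor.map and write a single CSV.
# Assumes scrape_year(year) ends with `return info` instead of df.to_csv(...).
if __name__ == "__main__":
    years = range(1916, 2023)
    all_rows = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map yields each year's list of dicts in submission order
        for year_rows in executor.map(scrape_year, years):
            all_rows.extend(year_rows)
    pd.DataFrame(all_rows).to_csv('aviation-safety_all_years.csv', index=False)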
What happens?
First of all, always look into the soup - therein lies the truth.
You are missing the headers in the request inside the while loop; that causes a 403 error and the table is not selected correctly.
How to fix it?
Set the headers correctly for your request in the while loop:
html = requests.get(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})
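The same effect can also be had with a requests.Session that carries the User-Agent as a default header, so every request inside the loop sends it automatically. Just a sketch of that variant:

import requests

# Sketch: a shared session, so the User-Agent header is sent with every request
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
html = session.get(url)   # no per-call headers argument needed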
Select your rows more specifically - note that there is no tbody in the HTML:
# Go through the table rows and extract the data under the 'td' tags
for row in soup.select('table tr.list'):
Also check the selectors for the pagination:
# If more than one page then iterate through all of them
if soup.select_one('div.pagenumbers span.current + a'):
    url = 'https://aviation-safety.net/wikibase/dblist.php'+soup.select_one('div.pagenumbers span.current + a')['href']
else:
    break
Example
import requests, csv
from bs4 import BeautifulSoup

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

with open('1916_aviation-safety.csv', "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["acc. date", "Type", "Registration", "operator", "fat", "Location", " ", "dmg", " ", " "])

    while True:
        print(url)
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through the table rows and extract the data under the 'td' tags
        for row in soup.select('table tr.list'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
            print(row)

        # If there is more than one page, iterate through all of them
        if soup.select_one('div.pagenumbers span.current + a'):
            url = 'https://aviation-safety.net/wikibase/dblist.php' + soup.select_one('div.pagenumbers span.current + a')['href']
        else:
            break
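As a side note (my addition, not part of the answer above): instead of concatenating the base URL with the href by hand, urllib.parse.urljoin can resolve the relative href against the current page URL, which is a little more robust if the link format ever changes:

from urllib.parse import urljoin

def next_page_url(current_url, soup):
    # Hypothetical helper: returns the absolute URL of the next page, or None on the last page
    next_link = soup.select_one('div.pagenumbers span.current + a')
    return urljoin(current_url, next_link['href']) if next_link else None

Inside the while loop you would then set url = next_page_url(url, soup) and break when it returns None.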
Just in case
Alternative solution with pandas.read_html()
that iterates over all years:
import requests, time, random
import pandas as pd
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

url = 'https://aviation-safety.net/wikibase/'
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')

data = []

for url in ['https://aviation-safety.net/' + a['href'] for a in soup.select('a[href*="/wikibase/dblist.php"]')]:
    while True:
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')
        data.append(pd.read_html(soup.prettify())[0])

        # If there is more than one page, iterate through all of them
        if soup.select_one('div.pagenumbers span.current + a'):
            url = 'https://aviation-safety.net/wikibase/dblist.php' + soup.select_one('div.pagenumbers span.current + a')['href']
        else:
            break

        time.sleep(random.random())

df = pd.concat(data)
df.loc[:, ~df.columns.str.contains('^Unnamed')].to_csv('aviation-safety.csv', index=False)
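If you run the full crawl, you will probably also want to guard against a non-200 response and against pages where pandas.read_html finds no table (it raises ValueError in that case). A minimal sketch of such a guard, meant to drop into the while loop above and reusing its variables:

# Sketch: defensive version of the per-page fetch inside the while loop
html = requests.get(url, headers=headers)
html.raise_for_status()                     # fail loudly on 403/404/5xx
soup = BeautifulSoup(html.text, 'html.parser')
try:
    data.append(pd.read_html(soup.prettify())[0])
except ValueError:
    # pandas raises ValueError when no <table> is found on the page
    print(f'No table found on {url}, skipping')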