BeautifulSoup Scraping Elements Containing Certain Date
Using the datetime library is the best way to do this since it allows for easy comparison date/time comparison. I was able to implement it in your code. I left some comments to explain the code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
headers = []
datarows = []
# define 1-1-2020 as a datetime object
after_date = datetime(2020, 1, 1)
with requests.Session() as s:
s.headers = {"User-Agent": "Safari/537.36"}
r = s.get('https://bitinfocharts.com/top-100-richest-dogecoin-addresses-2.html')
soup = bs(r.content, 'lxml')
# select all tr elements (minus the first one, which is the header)
table_elements = soup.select('tr')[1:]
address_links = []
for element in table_elements:
children = element.contents # get children of table element
url = children[1].a['href']
last_out_str = children[8].text
# check to make sure the date field isn't empty
if last_out_str != "":
# load date into datetime object for comparison (second part is defining the layout of the date as years-months-days hour:minute:second timezone)
last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z")
# if check to see if the date is after 2020/1/1
if last_out > after_date:
address_links.append(url)
for url in address_links:
r = s.get(url)
soup = bs(r.content, 'lxml')
table = soup.find(id="table_maina")
if table:
item = soup.find('h1').text
newitem = item.replace('Dogecoin', '')
finalitem = newitem.replace('Address', '')
for row in table.find_all('tr'):
heads = row.find_all('th')
if heads:
headers = [th.text for th in heads]
else:
datarows.append([td.text for td in row.find_all('td')])
fcsv = csv.writer(open(f'{finalitem}.csv', 'w', newline=''))
fcsv.writerow(headers)
fcsv.writerows(datarows)
Leave a comment if you have any questions about how it works that my comments didn't answer, I'd be happy to answer them!