Trouble finding all URLs from a webpage using BeautifulSoup
I am having trouble web scraping a list of articles. My driver scrolls to the bottom of the page and pulls the full HTML. My main problem is that I am only able to pull the first URL (href). Below is the code:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox(executable_path=r'C:\geckodriver.exe')
site = "https://sputniknews.com/search/?query=Navalny"

def get_pages():
    try:
        scroll_pause_time = 1
        driver.get(site)
        # wait until the cookies banner is clickable, then dismiss it
        # (a bare WebDriverWait(driver, 15) without .until() does not wait)
        WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
            (By.XPATH, '/html/body/div[9]/div/div/div/div[3]/div[2]/button'))).click()
        print('closed cookies window')
        driver.refresh()  # refresh is a method; without () it does nothing
        print('page refreshed')
        WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
            (By.XPATH, '/html/body/div[3]/div/div[5]/div/div[1]/div[1]/div[3]'))).click()
        print('closed news feed')
        driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div/div[1]/div[3]').click()
        print('clicked on more')
        # measure scroll height
        last_height = driver.execute_script("return document.documentElement.scrollHeight")
        while True:
            # scroll down to the bottom
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            # wait for the page to load
            time.sleep(scroll_pause_time)
            # calculate new scroll height and compare with the last scroll height
            new_height = driver.execute_script("return document.documentElement.scrollHeight")
            if new_height == last_height:
                print("break")
                break
            last_height = new_height
        # get the HTML of the fully scrolled page
        html = BeautifulSoup(driver.page_source, "lxml")
        articles = html.find('div', {'class': 'list__content'})
        ankor_list = articles.findChildren('a')
        links = []
        for ankor in ankor_list:
            url = 'https://sputniknews.com' + ankor.get('href')
            if url not in links:
                links.append(url)
                print(url)
        links = pd.DataFrame({'links': links})
        links = links.drop_duplicates(subset='links', keep='last')
        print(links)
        links.to_csv('links.csv')
        return links
    except NoSuchElementException:
        pass
    finally:
        driver.quit()
Your problem is that html.find('div', {'class': 'list__content'}) returns only the first matching div, so you only ever collect the anchors inside that one element. Grab the article links directly with find_all() instead. You could simplify it like this:
html = BeautifulSoup(driver.page_source, 'lxml')

links = []
for link in html.find_all('a', {'class': 'list__title'}):
    links.append('https://sputniknews.com' + link['href'])
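This drops in as a replacement for your find/findChildren block after the scroll loop; the rest of your code (the pandas deduplication and CSV export) can stay exactly as it is.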
This site loads its results through a backend request, so you can simplify things massively by hitting that endpoint directly and skipping Selenium altogether:
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

links = []  # accumulate across pages rather than resetting each iteration
for page in range(10):
    url = f'https://sputniknews.com/services/search/getmore/?query=Navalny&offset={page*20}&tags=0'
    resp = requests.get(url, headers=headers)
    html = BeautifulSoup(resp.text, 'html.parser')
    for link in html.find_all('a', {'class': 'list__title'}):
        links.append('https://sputniknews.com' + link['href'])
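If you would rather page until the results run out instead of hard-coding ten pages, you can keep requesting until a response contains no article links, then deduplicate and save with pandas as in your original code. A minimal sketch; the stop condition assumes the endpoint returns an empty result set past the last page, which you should verify:

import requests
import pandas as pd
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

links = []
offset = 0
while True:
    url = f'https://sputniknews.com/services/search/getmore/?query=Navalny&offset={offset}&tags=0'
    resp = requests.get(url, headers=headers)
    anchors = BeautifulSoup(resp.text, 'html.parser').find_all('a', {'class': 'list__title'})
    if not anchors:  # assumption: an empty page means the results are exhausted
        break
    links.extend('https://sputniknews.com' + a['href'] for a in anchors)
    offset += 20  # the endpoint appears to serve 20 results per request

df = pd.DataFrame({'links': links}).drop_duplicates(subset='links')
df.to_csv('links.csv', index=False)
print(df)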