Trouble finding all URLs from a webpage using BeautifulSoup
I am having trouble web scraping a list of articles. My driver scrolls to the bottom of the page and pulls the full HTML. My main problem is that I am only able to pull the first URL (href). Below is the code:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox(executable_path=r'C:\geckodriver.exe')
site = "https://sputniknews.com/search/?query=Navalny"

def get_pages():
    try:
        scroll_pause_time = 1
        driver.get(site)
        # wait until the cookies banner is clickable, then dismiss it
        # (a bare WebDriverWait(driver, 15) without .until() does not wait)
        WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
            (By.XPATH, '/html/body/div[9]/div/div/div/div[3]/div[2]/button'))).click()
        print('closed cookies window')
        driver.refresh()  # refresh is a method; without () it does nothing
        print('page refreshed')
        WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
            (By.XPATH, '/html/body/div[3]/div/div[5]/div/div[1]/div[1]/div[3]'))).click()
        print('closed news feed')
        driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div/div[1]/div[3]').click()
        print('clicked on more')
        # measure scroll height
        last_height = driver.execute_script("return document.documentElement.scrollHeight")
        while True:
            # scroll down to the bottom
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            # wait for the page to load
            time.sleep(scroll_pause_time)
            # calculate new scroll height and compare with the last scroll height
            new_height = driver.execute_script("return document.documentElement.scrollHeight")
            if new_height == last_height:
                print("break")
                break
            last_height = new_height
        # get the HTML of the fully scrolled page
        html = BeautifulSoup(driver.page_source, "lxml")
        articles = html.find('div', {'class': 'list__content'})
        ankor_list = articles.findChildren('a')
        links = []
        for ankor in ankor_list:
            url = 'https://sputniknews.com' + ankor.get('href')
            if url not in links:
                links.append(url)
                print(url)
        links = pd.DataFrame({'links': links})
        links = links.drop_duplicates(subset='links', keep='last')
        print(links)
        links.to_csv('links.csv')
        return links
    except NoSuchElementException:
        pass
    finally:
        driver.quit()
Your problem is that html.find('div', {'class': 'list__content'}) returns only the first matching div, so you only ever collect the anchors inside that one element. Grab the article links directly with find_all() instead. You could simplify it like this:
html = BeautifulSoup(driver.page_source, 'lxml')

links = []
for link in html.find_all('a', {'class': 'list__title'}):
    links.append('https://sputniknews.com' + link['href'])
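This drops in as a replacement for your find/findChildren block after the scroll loop; the rest of your code (the pandas deduplication and CSV export) can stay exactly as it is.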
This site loads its results through a backend request, so you can simplify things massively by hitting that endpoint directly and skipping Selenium altogether:
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

links = []  # accumulate across pages rather than resetting each iteration
for page in range(10):
    url = f'https://sputniknews.com/services/search/getmore/?query=Navalny&offset={page*20}&tags=0'
    resp = requests.get(url, headers=headers)
    html = BeautifulSoup(resp.text, 'html.parser')
    for link in html.find_all('a', {'class': 'list__title'}):
        links.append('https://sputniknews.com' + link['href'])
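If you would rather page until the results run out instead of hard-coding ten pages, you can keep requesting until a response contains no article links, then deduplicate and save with pandas as in your original code. A minimal sketch; the stop condition assumes the endpoint returns an empty result set past the last page, which you should verify:

import requests
import pandas as pd
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

links = []
offset = 0
while True:
    url = f'https://sputniknews.com/services/search/getmore/?query=Navalny&offset={offset}&tags=0'
    resp = requests.get(url, headers=headers)
    anchors = BeautifulSoup(resp.text, 'html.parser').find_all('a', {'class': 'list__title'})
    if not anchors:  # assumption: an empty page means the results are exhausted
        break
    links.extend('https://sputniknews.com' + a['href'] for a in anchors)
    offset += 20  # the endpoint appears to serve 20 results per request

df = pd.DataFrame({'links': links}).drop_duplicates(subset='links')
df.to_csv('links.csv', index=False)
print(df)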