Scraping data from multiple tooltips using Python and Selenium
Looks like all your code needed was some WebDriverWaits. React-based websites are a bit difficult to automate because of the many async updates (if I am not wrong) and the virtual DOM. I have refactored your code with WebDriverWaits as required (and also eliminated several lines, although you may retain them if you want better readability). Here is the code:
from selenium import webdriver
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)

# XPath matching every monitor marker drawn on the Leaflet map.
MARKER_XPATH = (
    "//div[@class='leaflet-pane leaflet-marker-pane']"
    "//div[contains(@class, 'leaflet-marker-icon')]"
)

# CSS selectors of the tooltip fields, in the order they are reported:
# site id, site name, local time, value, unit, MDL text.
TOOLTIP_SELECTORS = (
    ".LAR-tooltip-site-id > p",
    ".LAR-tooltip-site-name",
    ".LAR-tooltip-localtime",
    ".LAR-tooltip-data-value",
    ".LAR-tooltip-data-unit",
    ".tooltip-parameter-mdl",
)

driver = webdriver.Edge(EdgeChromiumDriverManager(log_level=20).install())
try:
    driver.maximize_window()
    driver.get("https://marathonlosangelesrefineryfencelinemonitoring.com/index.html")
    wait = WebDriverWait(driver, 10)

    # Navigate to monitors
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//div[@class='nav-link-text']"))).click()
    # Navigate to dropdown button
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//i[@class='arrow-down parameter-arrow']"))).click()
    # Select Hydrogen Sulfide and click
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//ul[@class='dropdown-menu' and @role='menu' and @aria-labelledby='ParameterDropdown']//li[12]"))).click()
    # Wait until the markers for the selected parameter are visible.
    wait.until(EC.visibility_of_all_elements_located((By.XPATH, MARKER_XPATH)))

    # Open the date picker.
    driver.find_element(By.CSS_SELECTOR, ".arrow-down.date-arrow").click()
    req_month = 'Aug'
    req_year = '2021'
    req_timeline = f"{req_month} {req_year}"
    print(f"Timeline Selected is: {req_timeline}")

    # Step back one month at a time (at most 11 steps) until the calendar
    # header shows the requested month.
    for _ in range(11):
        month = driver.find_element(By.XPATH, "//th[@class='month']").text
        if month == req_timeline:
            break
        driver.find_element(By.XPATH, "//th[@class='prev available']").click()

    # Pick day 1 and apply the selection.
    driver.find_element(By.XPATH, "//*[@class='table-condensed']//td[text()='1']").click()
    driver.find_element(By.XPATH, "//*[text()='Apply']").click()
    # Hard pause: the explicit waits alone left the run flaky while the map
    # re-renders for the new date.
    time.sleep(8)

    res = []
    for marker in driver.find_elements(By.XPATH, MARKER_XPATH):
        # Hover over the marker so that its tooltip is rendered.
        ActionChains(driver).move_to_element(marker).perform()
        time.sleep(1)
        try:
            fields = [driver.find_element(By.CSS_SELECTOR, selector)
                      for selector in TOOLTIP_SELECTORS]
            row = tuple(field.text for field in fields)
        except (NoSuchElementException, StaleElementReferenceException):
            # Best effort: skip a marker whose tooltip did not appear or was
            # re-rendered while being read.
            continue
        res.append(row)
    print(res)
finally:
    # Always close the browser and stop the driver process.
    driver.quit()
Here is the result:
Timeline Selected is: Aug 2021
[('F', 'Point Monitor', '7:55 AM', '1.80', 'ppb', 'MDL: 0.40 ppb'), ('B', 'Point Monitor', '7:55 AM', '1.20', 'ppb', 'MDL: 0.40 ppb'), ('E', 'Point Monitor', '7:55 AM', '1.10', 'ppb', 'MDL: 0.40 ppb'), ('A', 'Point Monitor', '7:55 AM', '0.40', 'ppb', 'MDL: 0.40 ppb')]
Process finished with exit code 0
As you can see, even with WebDriverWaits introduced, some places still needed a hard stop with time.sleep; otherwise the script gets flaky.
@ThaiNguyen, I am adding another answer to preserve the earlier one. I tried some crude ways to get the work done and succeeded after a lot of attempts, but take it with a pinch of salt, as I iterated over only 3 dates in Aug. The refactored code is pasted below, but before you look at the code, let me explain what I faced, so that you can watch out for it. I had to add a lot of sleeps in order for the DOM to settle down after each action (and, as you know, time.sleep is pretty unreliable when it comes to async). Even with the waits, the code still failed on stale elements, and adding more sleep time helped me take care of them (temporarily). Another thing, which to me is a big concern: even though this code succeeded in fetching the results, I cannot assure you that it would do so for all the dates in Aug (let alone for all the required months), because the code behaves pretty flakily with the rendered DOM. I don't want to blame the code at this point (given my limited knowledge of Selenium), but the DOM is heavily async, if I am not wrong. So with this code you cannot hope to get everything in one shot; rather, you may have to invest time in either refactoring and improving the code, or fetching the data in chunks by running it multiple times for a few dates at a time for each month, which is very frustrating given the flakiness.
from selenium import webdriver
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Module-level Edge WebDriver session shared by h2s_selection(), aug_date()
# and tooltips(). The argument passed to webdriver.Edge is whatever
# EdgeChromiumDriverManager(...).install() returns.
# NOTE(review): log_level=20 presumably corresponds to logging.INFO - confirm.
driver = webdriver.Edge(EdgeChromiumDriverManager(log_level=20).install())
# Maximize the browser window before any page is loaded.
driver.maximize_window()
def h2s_selection():
    """Load the fenceline-monitoring page and select Hydrogen Sulfide.

    Uses the module-level ``driver``. Each click waits up to 10 seconds for
    its target to become clickable, and the function returns only once the
    map markers are visible.
    """
    marker_xpath = "//div[@class='leaflet-pane leaflet-marker-pane']//div[contains(@class, 'leaflet-marker-icon')]"
    click_targets = (
        # Navigate to monitors
        "//div[@class='nav-link-text']",
        # Navigate to dropdown button
        "//i[@class='arrow-down parameter-arrow']",
        # Select Hydrogen Sulfide and click
        "//ul[@class='dropdown-menu' and @role='menu' and @aria-labelledby='ParameterDropdown']//li[12]",
    )

    driver.get("https://marathonlosangelesrefineryfencelinemonitoring.com/index.html")
    wait = WebDriverWait(driver, 10)
    for xpath in click_targets:
        target = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
        target.click()

    # Block until the markers for the selected parameter are visible.
    wait.until(EC.visibility_of_all_elements_located((By.XPATH, marker_xpath)))
def aug_date():
    """Select Aug 2021 in the date picker and scrape tooltips for days 1-3.

    Uses the module-level ``driver``. For each day it clicks the day cell,
    applies the selection, calls ``tooltips()`` and then reopens the date
    picker for the next iteration.
    """
    date_arrow = ".arrow-down.date-arrow"

    # Open the date picker.
    driver.find_element(By.CSS_SELECTOR, date_arrow).click()
    req_month = 'Aug'
    req_year = '2021'
    req_timeline = f"{req_month} {req_year}"
    print(f"Timeline Selected is: {req_timeline}")

    # Step back one month at a time (at most 11 steps) until the calendar
    # header shows the requested month.
    for _ in range(11):
        month = driver.find_element(By.XPATH, "//th[@class='month']").text
        if month == req_timeline:
            break
        driver.find_element(By.XPATH, "//th[@class='prev available']").click()

    for day in ('1', '2', '3'):
        # Hard pauses: the page re-renders after every action and explicit
        # waits alone were not reliable.
        time.sleep(5)
        each_date = driver.find_element(
            By.XPATH, f"//*[@class='table-condensed']//td[text()={day}]")
        print(f"Date is {each_date.text}")
        each_date.click()
        driver.find_element(By.XPATH, "//*[text()='Apply']").click()
        time.sleep(10)
        tooltips()
        time.sleep(5)
        # Reopen the date picker for the next day.
        driver.find_element(By.CSS_SELECTOR, date_arrow).click()
def tooltips():
    """Hover over every map marker and collect the text of its tooltip.

    Uses the module-level ``driver``. Markers whose tooltip cannot be read
    are skipped.

    Returns:
        A list with one tuple per readable marker, holding the text of the
        tooltip's site id, site name, local time, value, unit and MDL
        fields. The same list is also printed.
    """
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
    )

    marker_xpath = "//div[@class='leaflet-pane leaflet-marker-pane']//div[contains(@class, 'leaflet-marker-icon')]"
    tooltip_selectors = (
        ".LAR-tooltip-site-id > p",
        ".LAR-tooltip-site-name",
        ".LAR-tooltip-localtime",
        ".LAR-tooltip-data-value",
        ".LAR-tooltip-data-unit",
        ".tooltip-parameter-mdl",
    )

    res = []
    for marker in driver.find_elements(By.XPATH, marker_xpath):
        # Hover over the marker so that its tooltip is rendered.
        ActionChains(driver).move_to_element(marker).perform()
        time.sleep(1)
        try:
            fields = [driver.find_element(By.CSS_SELECTOR, selector)
                      for selector in tooltip_selectors]
            row = tuple(field.text for field in fields)
        except (NoSuchElementException, StaleElementReferenceException):
            # Best effort: skip a marker whose tooltip did not appear or was
            # re-rendered while being read.
            continue
        res.append(row)
    print(res)
    return res
if __name__ == "__main__":
    # Select the parameter first, then step through the dates; always shut
    # the browser down afterwards, even if a step raises.
    try:
        h2s_selection()
        aug_date()
    finally:
        driver.quit()
Output:
Timeline Selected is: Aug 2021
Date is 1
[('F', 'Point Monitor', '10:55 AM', '0.90', 'ppb', 'MDL: 0.40 ppb'), ('B', 'Point Monitor', '10:55 AM', '1.20', 'ppb', 'MDL: 0.40 ppb'), ('E', 'Point Monitor', '10:55 AM', '1.30', 'ppb', 'MDL: 0.40 ppb'), ('A', 'Point Monitor', '10:55 AM', '0.60', 'ppb', 'MDL: 0.40 ppb')]
Date is 2
[('B', 'Point Monitor', '10:25 PM', '1.70', 'ppb', 'MDL: 0.40 ppb'), ('E', 'Point Monitor', '10:25 PM', '1.90', 'ppb', 'MDL: 0.40 ppb')]
Date is 3
[('F', 'Point Monitor', '9:55 AM', '1.20', 'ppb', 'MDL: 0.40 ppb'), ('B', 'Point Monitor', '9:55 AM', '1.20', 'ppb', 'MDL: 0.40 ppb'), ('E', 'Point Monitor', '9:55 AM', '1.90', 'ppb', 'MDL: 0.40 ppb'), ('A', 'Point Monitor', '9:55 AM', '0.50', 'ppb', 'MDL: 0.40 ppb')]
Process finished with exit code 0