Я пытаюсь спарсить (scrape) артикул (SKU) и описание товаров с этой страницы: https://www.milwaukeetool.com/products/power-tools/drilling/drill-drivers
но нужные элементы не извлекаются, хотя код запускается без ошибок. Кто-нибудь знает, почему? Похоже, я выбираю правильные элементы. Я пробовал и requests, и Selenium (код ниже), но получаю один и тот же результат.
Вариант с requests:
import requests
import pandas as pd
from bs4 import BeautifulSoup

link = 'https://www.milwaukeetool.com/products/power-tools/drilling/drill-drivers'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
}

# Short class-based selectors instead of the full "#MTBody > main > ..." path:
# the long path breaks on any layout change, and its Tailwind utility classes
# ("md:pt-[30px]") need backslash escapes that are invalid in a normal
# (non-raw) Python string literal.
CARD_SELECTOR = "div.result-title__wrap"   # caption box of one product card
SKU_SELECTOR = ".result-sku"               # SKU element inside the card
TITLE_SELECTOR = "div.result-title"        # product title inside the card

# A timeout keeps the script from hanging forever; raise_for_status() turns an
# HTTP error page into an explicit exception instead of an empty result.
res = requests.get(link, headers=headers, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.text, "html.parser")

# NOTE(review): the listing is reportedly rendered client-side, so the static
# HTML fetched here may contain no product cards at all. In that case the loop
# below matches nothing and the CSV holds only the header row; a real browser
# or the site's JSON API is needed to get the data.
rows = []
for item in soup.select(CARD_SELECTOR):
    # Selectors are relative to the card, so each lookup stays inside `item`.
    sku_tag = item.select_one(SKU_SELECTOR)
    desc_tag = item.select_one(TITLE_SELECTOR)
    if sku_tag is None or desc_tag is None:
        # Skip incomplete cards instead of crashing on None.get_text().
        continue
    sku = sku_tag.get_text(strip=True)
    desc = desc_tag.get_text(strip=True)
    rows.append({'sku': sku, 'desc': desc})
    print(sku, desc)

# Build the frame once; pd.concat inside the loop copies it on every iteration.
df = pd.DataFrame(rows, columns=['sku', 'desc'])
df.to_csv("milwaukee.csv", index=False)
Вариант с Selenium:
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException

website = 'https://www.milwaukeetool.com/products/power-tools/drilling/drill-drivers'

# Locate individual product cards, not the single ".product-listing-main"
# wrapper: iterating the wrapper runs the loop once and find_element() then
# returns only the first SKU/title on the page.
CARD_SELECTOR = '.result-title__wrap'
SKU_SELECTOR = '.result-sku'
TITLE_SELECTOR = '.result-title'

prod_num = []
prod_desc = []

driver = uc.Chrome()
try:
    driver.get(website)

    # Wait until at least one product card has been rendered.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, CARD_SELECTOR))
    )

    # Scroll BEFORE collecting so that any cards added on scroll are already
    # in the DOM when they are read.
    # NOTE(review): assumes the page loads more cards on scroll; confirm.
    for _ in range(4):
        driver.execute_script("window.scrollBy(0, 2000);")
        time.sleep(2)

    for card in driver.find_elements(By.CSS_SELECTOR, CARD_SELECTOR):
        try:
            sku = card.find_element(By.CSS_SELECTOR, SKU_SELECTOR).text
            description = card.find_element(By.CSS_SELECTOR, TITLE_SELECTOR).text
        except NoSuchElementException:
            # Skip cards that lack one of the fields so both lists stay aligned.
            continue
        prod_num.append(sku)
        prod_desc.append(description)
finally:
    # Always close the browser, even if waiting or scraping raised.
    driver.quit()

print(len(prod_num))
print(len(prod_desc))

# Create a DataFrame from the scraped data
df = pd.DataFrame({'code': prod_num, 'description': prod_desc})

# Save the DataFrame to a CSV file
df.to_csv('milwtest1.csv', index=False)
print(df)
🤔 А знаете ли вы, что...
В Python есть инструменты для тестирования кода, такие как библиотека unittest.
Обратите внимание на атрибуты loc (CSS-селектор или XPath) в следующем шаблоне:
<!-- Scraping template: open the listing page, loop over the matched product elements and extract two columns (sku, description) from each one. NOTE(review): the meaning of each tag is inferred from its name; confirm against the documentation of the tool that consumes this template. -->
<actions>
<!-- Page to open. -->
<action_goto url = "https://www.milwaukeetool.com/products/power-tools/drilling/drill-drivers" />
<action_loopineles>
<!-- CSS locator of the elements to loop over: the product links inside the results list. -->
<element loc = "div.product-listing__results-list > div > div a" />
<!-- Presumably one output row per looped element, written to the table named in tabname. -->
<action_extract tabname = "dat_00000000000012ab">
<!-- Column c01 ("sku"): text of the SKU span located inside the current element. -->
<column_element colname = "c01" nickname = "sku">
<element loc = "span.font-helvetica67" />
<!-- Equivalent XPath locator, kept as an alternative to the CSS one above: -->
<!-- <element loc = ".//span[contains(@class, 'font-helvetica67')]" /> -->
<!-- Remove "(" and ")" from the extracted value (presumably the page shows the SKU in parentheses). -->
<transform>
<fun_replace substr = "(" newstr = "" />
<fun_replace substr = ")" newstr = "" />
</transform>
</column_element>
<!-- Column c02 ("description"): product title inside the card caption box. -->
<column_element colname = "c02" nickname = "description">
<element loc = "div.result-title__wrap > div.text-brandBlack" />
</column_element>
</action_extract>
</action_loopineles>
</actions>
Извлеченные данные:
Или вы можете извлечь данные из ответа на запрос:
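Страница отрисовывается динамически (с помощью JavaScript), поэтому одного
requests с разбором HTML недостаточно.
У сайта есть API, но для получения данных ему нужен идентификатор категории. Поэтому мой подход состоит из двух шагов: сначала получить идентификатор категории из HTML страницы, затем запросить список товаров через API: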
import re

import requests
from pandas import DataFrame

URL = "https://www.milwaukeetool.com/products/power-tools/drilling/drill-drivers"
API_URL = 'https://www.milwaukeetool.com/api/v1/products/listing'
TIMEOUT = 30  # seconds; prevents the script from hanging on a stalled connection

# GET CATEGORY ID -------------------------------------------------------------
response = requests.get(URL, timeout=TIMEOUT)
response.raise_for_status()

# Matches the literal text \"page_id\":\"{GUID}\" (quotes preceded by
# backslashes, GUID wrapped in braces) and captures the GUID without braces.
pattern = r'\\\"page_id\\\":\\\"{(\w{8}-\w{4}-\w{4}-\w{4}-\w{12})}\\\"'
match = re.search(pattern, response.text)
if match is None:
    # Fail with a clear message instead of AttributeError on None.groups().
    raise ValueError(f"page_id not found in the HTML of {URL}")
category_id = match.group(1)

# PULL CATEGORY FROM API ------------------------------------------------------
data = {
    'language': 'en',
    'returnAll': True,
    'categories': category_id,
}
response = requests.post(API_URL, json=data, timeout=TIMEOUT)
response.raise_for_status()
results = response.json()["data"]["results"]

# Keep only the two fields of interest; explicit columns preserve the table
# layout even when the API returns no results.
products = DataFrame(
    [{"sku": result["sku"], "description": result["title"]} for result in results],
    columns=["sku", "description"],
)
print(products)
Результаты выглядят следующим образом:
sku description
0 2905-20 M18 FUEL™ ½” Drill/Driver w/ ONE-KEY™
1 2903-20 M18 FUEL™ 1/2" Drill/Driver
2 3403-20 M12 FUEL™ 1/2" Drill/Driver
3 2505-20 M12 FUEL™ Installation Drill/Driver (Tool-Only)
4 2803-20 M18 FUEL™ 1/2" Drill Driver (Tool Only)
5 2804-20 M18 FUEL™ ½” Hammer Drill/Driver (Tool Only)
6 2503-20 M12 FUEL™ 1/2" Drill Driver (Tool Only)
7 2810-20 M18 FUEL™ Mud Mixer with 180° Handle (Tool Only)
8 3602-20 M18™ Compact Brushless 1/2" Hammer Drill/Driver
9 3601-20 M18™ Compact Brushless 1/2" Drill/ Driver
10 2902-20 M18 1/2" Brushless Hammer Drill (Tool Only)
11 2801-20 M18 Compact Brushless 1/2" Drill Driver Bare Tool
12 2407-20 M12™ 3/8” Drill/Driver (Tool Only)
13 2606-20 M18™ 1/2" Drill Driver
14 2615-20 M18™ Right Angle Drill
15 0726-20 M28™ Cordless 1/2" Hammer Drill (Tool Only)
16 1107-6 1/2 D-Handle Drill 0-500 RPM
17 1610-1 1/2" Compact Drill 650 RPM
18 0244-1 1/2" Magnum® Drill, 0-700 RPM
19 0101-20 1/4" Magnum® Drill, 0-4000 RPM with QUIK-LOK® ...
20 0302-20 1/2" Magnum® Drill, 0-850 RPM with All Metal ...
21 0233-20 3/8" Magnum® Drill, 0-2800 RPM with Keyless Chuck
22 0234-6 1/2" Magnum® Drill, 0-950 RPM
23 1660-6 1/2" Compact Drill 450 RPM
24 0100-20 1/4" Magnum® Drill, 0-2500 RPM with QUIK-LOK® ...
25 1101-1 1/2 D-Handle Drill 500 RPM
26 0200-20 3/8" Magnum® Drill, 0-1200 RPM
27 0240-20 3/8" Drill
28 1630-1 1/2" Compact Drill 900 RPM
29 1007-1 1/2 D-Handle Drill 0-600 RPM
30 0370-20 3/8" Close Quarter Angle Drill
31 0299-20 1/2" Magnum® Drill, 0-850 RPM
32 0201-20 3/8" Magnum® Drill, 0-2500 RPM with All Metal ...
33 1001-1 1/2 in. D-Handle Drill 0-600 RPM
34 0202-20 3/8" Magnum® Drill, 0-1200 RPM with All Metal...
35 0235-21 1/2" Magnum® Drill, 0-950 RPM with All Metal...
36 0300-20 1/2" Magnum® Drill, 0-850 RPM