Я хочу спарсить (извлечь) различные элементы веб-сайта, и у меня возникла проблема с определённой частью HTML-кода. Вот этот HTML-код:
if ( window.tc_vars ) {
tc_vars["owner_sub_category"] = ["CENTRE_MULTIMARQUES"];
}
var SellerInformationData = {"customerType":"PRO","reference":"E111477039","account":{"familyCode":"CENTRE_MULTIMARQUES","family":"Centre multimarques","address":{"street1":"3 RUE DAGUERRE","city":"RAMBOUILLET","zipCode":"78120","country":"FRANCE","location":{"longitude":1.8492847,"latitude":48.6527683}},"publishedName":"LES GRANDES OCCASIONS.COM RAMBOUILLET","createdDate":"2022-05-30T17:00:03Z"},"companyCreationDate":"2003-04-01","rating":4.75,"feedbackCount":4,"customShowroomUrl":"les-grandes-occasions-rambouillet.espacevo.fr","classifiedsCount":27}
Мне нужны рейтинг и количество отзывов, и я пробую такой код:
# Question's snippet, kept exactly as posted.
# Locate the first <script> tag whose text contains 'var SellerInformationData'.
script_tag2 = soup.find('script', text=lambda text: text and 'var SellerInformationData' in text)
# Remove the assignment prefix and every ';' from the tag's text. Only those
# two substrings are removed; any other text inside the tag stays in the string.
json_str2 = script_tag2.text.strip().replace('var SellerInformationData = ', '').replace(';', '')
# Parse whatever is left as JSON.
datajsSID = json.loads(json_str2)
# Read the nested city value; the bare except falls back to pd.NA on any error.
try:
city = datajsSID['account']['address']['city']
except:
city = pd.NA
но получаю следующую ошибку. Не могли бы вы мне помочь?
Traceback (most recent call last):
File "/Users/matheoferrer/PycharmProjects/MemoireM2/main.py", line 257, in <module>
data = json.loads(json_str)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/json/__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/json/decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/json/decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
🤔 А знаете ли вы, что...
Python поддерживает множество парадигм программирования, включая процедурное, объектно-ориентированное и функциональное программирование.
Ваша проблема в том, что soup.find()
возвращает весь тег целиком, а не только ту строку, на которой сработал фильтр.
Например:
from bs4 import BeautifulSoup
# Minimal page for the demonstration: a single <script> element that holds an
# `if` statement followed by the `var SellerInformationData = {...}` assignment.
html = """
<script>
if ( window.tc_vars ) {
tc_vars["owner_sub_category"] = ["CENTRE_MULTIMARQUES"];
}
var SellerInformationData = {"customerType":"PRO","reference":"E111477039","account":{"familyCode":"CENTRE_MULTIMARQUES","family":"Centre multimarques","address":{"street1":"3 RUE DAGUERRE","city":"RAMBOUILLET","zipCode":"78120","country":"FRANCE","location":{"longitude":1.8492847,"latitude":48.6527683}},"publishedName":"LES GRANDES OCCASIONS.COM RAMBOUILLET","createdDate":"2022-05-30T17:00:03Z"},"companyCreationDate":"2003-04-01","rating":4.75,"feedbackCount":4,"customShowroomUrl":"les-grandes-occasions-rambouillet.espacevo.fr","classifiedsCount":27}
</script>
"""


def mentions_seller_data(text):
    """Tell whether *text* is non-empty and contains 'var SellerInformationData'."""
    return bool(text) and 'var SellerInformationData' in text


soup = BeautifulSoup(html, 'html.parser')
# The predicate is used as the text filter for <script> tags.
script_tag2 = soup.find('script', text=mentions_seller_data)
print(script_tag2)
Это выведет:
if ( window.tc_vars ) {
tc_vars["owner_sub_category"] = ["CENTRE_MULTIMARQUES"];
}
var SellerInformationData = {"customerType":"PRO","reference":"E111477039","account":{"familyCode":"CENTRE_MULTIMARQUES","family":"Centre multimarques","address":{"street1":"3 RUE DAGUERRE","city":"RAMBOUILLET","zipCode":"78120","country":"FRANCE","location":{"longitude":1.8492847,"latitude":48.6527683}},"publishedName":"LES GRANDES OCCASIONS.COM RAMBOUILLET","createdDate":"2022-05-30T17:00:03Z"},"companyCreationDate":"2003-04-01","rating":4.75,"feedbackCount":4,"customShowroomUrl":"les-grandes-occasions-rambouillet.espacevo.fr","classifiedsCount":27}
После вызова replace('var SellerInformationData = ', '') в строке всё ещё остаётся оператор if:
if ( window.tc_vars ) {
tc_vars["owner_sub_category"] = ["CENTRE_MULTIMARQUES"]
}
{"customerType":"PRO","reference":"E111477039","account":{"familyCode":"CENTRE_MULTIMARQUES","family":"Centre multimarques","address":{"street1":"3 RUE DAGUERRE","city":"RAMBOUILLET","zipCode":"78120","country":"FRANCE","location":{"longitude":1.8492847,"latitude":48.6527683}},"publishedName":"LES GRANDES OCCASIONS.COM RAMBOUILLET","createdDate":"2022-05-30T17:00:03Z"},"companyCreationDate":"2003-04-01","rating":4.75,"feedbackCount":4,"customShowroomUrl":"les-grandes-occasions-rambouillet.espacevo.fr","classifiedsCount":27}
Что, конечно, не является допустимым JSON.
Вам нужно выделить эту строку — либо с помощью чего-то вроде модуля re, либо вручную, просмотрев текст этого блока скрипта. Попробуйте:
import json
from bs4 import BeautifulSoup
def extract_js_object(script_text, prefix):
    """Decode the JSON literal that follows *prefix* inside JavaScript source.

    Args:
        script_text: Full text of a script block; it may contain unrelated
            statements before and after the assignment.
        prefix: Text that immediately precedes the JSON literal, for example
            ``"var SellerInformationData = "``.

    Returns:
        The decoded JSON value, or an empty dict when *prefix* does not occur
        in *script_text*.

    Raises:
        json.JSONDecodeError: If the text after *prefix* does not start with
            valid JSON.
    """
    start = script_text.find(prefix)
    if start == -1:
        return {}
    # raw_decode() does not skip leading whitespace itself, so strip it first.
    payload = script_text[start + len(prefix):].lstrip()
    # raw_decode() stops at the end of the first complete JSON value, so a
    # trailing ';', later statements, or a JSON literal spread over several
    # lines are all handled without any manual trimming.
    value, _end = json.JSONDecoder().raw_decode(payload)
    return value


html = """
<script>
if ( window.tc_vars ) {
tc_vars["owner_sub_category"] = ["CENTRE_MULTIMARQUES"];
}
var SellerInformationData = {"customerType":"PRO","reference":"E111477039","account":{"familyCode":"CENTRE_MULTIMARQUES","family":"Centre multimarques","address":{"street1":"3 RUE DAGUERRE","city":"RAMBOUILLET","zipCode":"78120","country":"FRANCE","location":{"longitude":1.8492847,"latitude":48.6527683}},"publishedName":"LES GRANDES OCCASIONS.COM RAMBOUILLET","createdDate":"2022-05-30T17:00:03Z"},"companyCreationDate":"2003-04-01","rating":4.75,"feedbackCount":4,"customShowroomUrl":"les-grandes-occasions-rambouillet.espacevo.fr","classifiedsCount":27}
</script>
"""
prefix = "var SellerInformationData = "

if __name__ == "__main__":
    soup = BeautifulSoup(html, "html.parser")
    # `string=` is the current name of the filter formerly spelled `text=`.
    script_tag2 = soup.find("script", string=lambda s: s and prefix in s)
    # find() returns None when no <script> matches; fall back to empty text
    # so the lookups below yield None instead of raising AttributeError.
    script_text = script_tag2.text if script_tag2 is not None else ""
    datajsSID = extract_js_object(script_text, prefix)

    city = datajsSID.get("account", {}).get("address", {}).get("city")
    print(f"City: {city}")
    # The review count can be read the same way: datajsSID.get("feedbackCount")
    rating = datajsSID.get("rating")
    print(f"Rating: {rating}")
Это должно вывести:
City: RAMBOUILLET
Rating: 4.75