Скрапинг JSON из HTML в Python

Я хочу извлечь (спарсить) различные элементы веб-сайта, и у меня возникла проблема с определённой частью HTML-кода. Вот этот HTML-код:

  if ( window.tc_vars ) {
    tc_vars["owner_sub_category"] = ["CENTRE_MULTIMARQUES"];
  }
  var SellerInformationData = {"customerType":"PRO","reference":"E111477039","account":{"familyCode":"CENTRE_MULTIMARQUES","family":"Centre multimarques","address":{"street1":"3 RUE DAGUERRE","city":"RAMBOUILLET","zipCode":"78120","country":"FRANCE","location":{"longitude":1.8492847,"latitude":48.6527683}},"publishedName":"LES GRANDES OCCASIONS.COM RAMBOUILLET","createdDate":"2022-05-30T17:00:03Z"},"companyCreationDate":"2003-04-01","rating":4.75,"feedbackCount":4,"customShowroomUrl":"les-grandes-occasions-rambouillet.espacevo.fr","classifiedsCount":27}

Мне нужны рейтинг и количество отзывов, и я пробую такой код:

# Find the <script> element whose text contains the SellerInformationData assignment.
script_tag2 = soup.find('script', text=lambda text: text and 'var SellerInformationData' in text)
        # Take the full text of that script, remove the assignment prefix and every ';',
        # then parse whatever remains as JSON.
        json_str2 = script_tag2.text.strip().replace('var SellerInformationData = ', '').replace(';', '')
        datajsSID = json.loads(json_str2)

        # Read the seller's city from the parsed data; any exception falls back to pd.NA.
        try:
            city = datajsSID['account']['address']['city']
        except:
            city = pd.NA

но я получаю такую ошибку. Можете мне помочь?

Traceback (most recent call last):
  File "/Users/matheoferrer/PycharmProjects/MemoireM2/main.py", line 257, in <module>
    data = json.loads(json_str)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

🤔 А знаете ли вы, что...
Python поддерживает множество парадигм программирования, включая процедурное, объектно-ориентированное и функциональное программирование.


66
1

Ответ:

Решено

Проблема в том, что soup.find() возвращает весь тег целиком, а не только ту строку, на которой сработал фильтр.

Например:

from bs4 import BeautifulSoup

# Sample page fragment: one <script> holding an unrelated `if` statement followed by
# the `var SellerInformationData = {...}` assignment.
html = """
<script>
  if ( window.tc_vars ) {
    tc_vars["owner_sub_category"] = ["CENTRE_MULTIMARQUES"];
  }
  var SellerInformationData = {"customerType":"PRO","reference":"E111477039","account":{"familyCode":"CENTRE_MULTIMARQUES","family":"Centre multimarques","address":{"street1":"3 RUE DAGUERRE","city":"RAMBOUILLET","zipCode":"78120","country":"FRANCE","location":{"longitude":1.8492847,"latitude":48.6527683}},"publishedName":"LES GRANDES OCCASIONS.COM RAMBOUILLET","createdDate":"2022-05-30T17:00:03Z"},"companyCreationDate":"2003-04-01","rating":4.75,"feedbackCount":4,"customShowroomUrl":"les-grandes-occasions-rambouillet.espacevo.fr","classifiedsCount":27}
</script>
"""

soup = BeautifulSoup(html, 'html.parser')
# `string=` is the current name of this filter; `text=` is its deprecated alias.
# The callable receives the tag's string (or None), hence the `text and ...` guard.
script_tag2 = soup.find('script', string=lambda text: text and 'var SellerInformationData' in text)

# Print the match to show what the filter actually selected.
print(script_tag2)

Это даст вам:

  if ( window.tc_vars ) {
    tc_vars["owner_sub_category"] = ["CENTRE_MULTIMARQUES"];
  }
  var SellerInformationData = {"customerType":"PRO","reference":"E111477039","account":{"familyCode":"CENTRE_MULTIMARQUES","family":"Centre multimarques","address":{"street1":"3 RUE DAGUERRE","city":"RAMBOUILLET","zipCode":"78120","country":"FRANCE","location":{"longitude":1.8492847,"latitude":48.6527683}},"publishedName":"LES GRANDES OCCASIONS.COM RAMBOUILLET","createdDate":"2022-05-30T17:00:03Z"},"companyCreationDate":"2003-04-01","rating":4.75,"feedbackCount":4,"customShowroomUrl":"les-grandes-occasions-rambouillet.espacevo.fr","classifiedsCount":27}

Когда вы вызываете replace('var SellerInformationData = ', ''), в строке всё ещё остаётся оператор if:

if ( window.tc_vars ) {
    tc_vars["owner_sub_category"] = ["CENTRE_MULTIMARQUES"]
  }
  {"customerType":"PRO","reference":"E111477039","account":{"familyCode":"CENTRE_MULTIMARQUES","family":"Centre multimarques","address":{"street1":"3 RUE DAGUERRE","city":"RAMBOUILLET","zipCode":"78120","country":"FRANCE","location":{"longitude":1.8492847,"latitude":48.6527683}},"publishedName":"LES GRANDES OCCASIONS.COM RAMBOUILLET","createdDate":"2022-05-30T17:00:03Z"},"companyCreationDate":"2003-04-01","rating":4.75,"feedbackCount":4,"customShowroomUrl":"les-grandes-occasions-rambouillet.espacevo.fr","classifiedsCount":27}

Что, конечно, не является допустимым JSON.

Вам нужно выделить именно эту строку — либо с помощью чего-то вроде модуля re, либо вручную, перебирая строки этого блока скрипта. Попробуйте:

import json
from bs4 import BeautifulSoup

# Sample page fragment: one <script> holding an unrelated `if` statement followed by
# the `var SellerInformationData = {...}` assignment we want to parse.
html = """
<script>
  if ( window.tc_vars ) {
    tc_vars["owner_sub_category"] = ["CENTRE_MULTIMARQUES"];
  }
  var SellerInformationData = {"customerType":"PRO","reference":"E111477039","account":{"familyCode":"CENTRE_MULTIMARQUES","family":"Centre multimarques","address":{"street1":"3 RUE DAGUERRE","city":"RAMBOUILLET","zipCode":"78120","country":"FRANCE","location":{"longitude":1.8492847,"latitude":48.6527683}},"publishedName":"LES GRANDES OCCASIONS.COM RAMBOUILLET","createdDate":"2022-05-30T17:00:03Z"},"companyCreationDate":"2003-04-01","rating":4.75,"feedbackCount":4,"customShowroomUrl":"les-grandes-occasions-rambouillet.espacevo.fr","classifiedsCount":27}
</script>
"""

# Text that precedes the JSON object on its line, and the optional terminator after it.
prefix = "var SellerInformationData = "
suffix = ";"

soup = BeautifulSoup(html, 'html.parser')
# `string=` is the current name of this filter; `text=` is its deprecated alias.
# The callable receives the tag's string (or None), hence the `text and ...` guard.
script_tag2 = soup.find('script', string=lambda text: text and prefix in text)

# find() returns None when no <script> matches; fall back to an empty text so the
# lookups below yield None instead of raising AttributeError on `.text`.
script_text = script_tag2.text if script_tag2 is not None else ""

datajsSID = {}
# The script contains other statements, so parse only the line that carries the assignment.
for line in script_text.split("\n"):
    line = line.strip()
    if line.startswith(prefix):
        # Drop the `var ... = ` prefix and a trailing ';' (if any) to leave bare JSON.
        line = line.removeprefix(prefix).removesuffix(suffix)
        datajsSID = json.loads(line)
        break

# Chained .get() calls with {} defaults return None rather than raising when a key is missing.
city = datajsSID.get("account",{}).get("address",{}).get("city")
print(f"City: {city}")

rating = datajsSID.get("rating")
print(f"Rating: {rating}")

Это должно дать вам:

City: RAMBOUILLET
Rating: 4.75