问题描述:

I have a webpage that contains several javascript elements. I only want to access one, named SOURCE.pdp.propertyJSON, and access the attributes in a PYTHONIC manner.

An edited (for the sake of readability) version of the HTML source code is below, followed by my Python code.

Any pointers would be greatly appreciated!

<script type="text/javascript">

SOURCE = SOURCE || {};

SOURCE.pdp = SOURCE.pdp || {};

SOURCE.pdp.propertyJSON = {

"neighborhood": "Westwood",

"neighborhoodId": 7187,

"zipCode": "90024",

"city": "Los Angeles",

"county": "Los Angeles",

"countyFIPS": "06037",

"stateCode": "CA",

"stateName": "California",

"type": "CONDO",

"typeDisplay": "Condo",

"numBedrooms": "2",

"numBathrooms": 2,

"numFullBathrooms": 2,

"numBeds": 2,

"indexSource": "Assessor",

"isForeclosure": false,

"isOpaqueBAL": false,

"foreclosureStatus": "",

"isSrpFeatured": false,

"price": null,

"sqft": 1321,

"formattedBedAndBath": "2bd, 2 full ba",

"formattedSqft": "1,321 sqft"

}

pdp_location_data = {

"neighborhood": {

"locationId": "87308",

"name": "Westwood",

"locationType": "neighborhood",

"altId": "7187"

},

"state": {

"locationId": "5",

"name": "California",

"locationType": "state",

"altId": "CA"

},

"county": {

"locationId": "57",

"name": "Los Angeles County",

"locationType": "county",

"altId": "06037"

},

"city": {

"locationId": "22637",

"name": "Los Angeles",

"locationType": "city",

"altId": "4396"

},

"zipCode": {

"locationId": "76090",

"name": "90024",

"locationType": "zipCode",

"altName": "90024",

"altId": "90024"

}

};

SOURCE.pdp.isCountySupportsValuation = true;

SOURCE.pdp.isInHighDemandRegion = false;

var _SPANLONG = pdp_location_data.longitude;

var _SPANLAT = pdp_location_data.latitude;

var _CENLONG = pdp_location_data.longitude;

var _CENLAT = pdp_location_data.latitude;

</script>

Beware the ugly python!

 from bs4 import BeautifulSoup as bsoup

import requests as rq

url = 'https://www.SOURCE.com'

source_code = rq.get(url).text

soupcon = bsoup(source_code, "html.parser")

# Every <script type="text/javascript"> element on the page.
souper = soupcon.find_all('script', {'type': 'text/javascript'})

# Values are taken from the first comma-separated part that mentions each
# key. Later parts also contain the word "zipCode" (it appears again inside
# pdp_location_data), so only the first hit is kept.
zipCode = None
numBathrooms = None

for line in souper:
    script_text = str(line)
    if 'SOURCE.pdp.propertyJSON' not in script_text:
        continue

    # Crude tokenisation: split the whole script on the JS separator ','.
    parts = script_text.split(',')
    for var in parts:
        # Everything after the first ':' is the value. Strip the surrounding
        # whitespace first, otherwise the quotes are not at the ends of the
        # string and strip('"') cannot remove them.
        value = var.partition(':')[2].strip().strip('"')
        if 'zipCode' in var and zipCode is None:
            zipCode = value
        elif 'numBathrooms' in var and numBathrooms is None:
            numBathrooms = value

As you can see, I am currently accessing the JS object I want by finding all script elements that are of the type text/javascript, iterating through them to find the script that contains the object that I want, then splitting the entire script by the JS separator ',', and identifying elements of the JS object by searching through them for my key words. Not an ideal solution.

网友答案:

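You can load the data as a dict using json.loads. This works because the object literal assigned to SOURCE.pdp.propertyJSON is itself valid JSON (double-quoted keys and strings, plus true/false/null), so once a regex has isolated it from the rest of the script it can be parsed directly: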

from bs4 import BeautifulSoup as bsoup
import re
from json import loads
# Inline copy of the page's <script> element, held in a string so the
# extraction below can be run against fixed markup. Note that in this copy the
# whole propertyJSON literal sits on a single line.
source = """<script type="text/javascript">  SOURCE = SOURCE || {};
  SOURCE.pdp = SOURCE.pdp || {};
  SOURCE.pdp.propertyJSON = {    "neighborhood": "Westwood",    "neighborhoodId": 7187,    "zipCode": "90024",    "city": "Los Angeles",    "county": "Los Angeles",    "countyFIPS": "06037",    "stateCode": "CA",    "stateName": "California",    "type": "CONDO",    "typeDisplay": "Condo",    "numBedrooms": "2",    "numBathrooms": 2,    "numFullBathrooms": 2,    "numBeds": 2,    "indexSource": "Assessor",    "isForeclosure": false,    "isOpaqueBAL": false,    "foreclosureStatus": "",    "isSrpFeatured": false,    "price": null,    "sqft": 1321,    "formattedBedAndBath": "2bd, 2 full ba",    "formattedSqft": "1,321 sqft"  }  pdp_location_data = {    "neighborhood": {      "locationId": "87308",      "name": "Westwood",      "locationType": "neighborhood",      "altId": "7187"    },    "state": {      "locationId": "5",      "name": "California",      "locationType": "state",      "altId": "CA"    },    "county": {      "locationId": "57",      "name": "Los Angeles County",      "locationType": "county",      "altId": "06037"    },    "city": {      "locationId": "22637",      "name": "Los Angeles",      "locationType": "city",      "altId": "4396"    },    "zipCode": {      "locationId": "76090",      "name": "90024",      "locationType": "zipCode",      "altName": "90024",      "altId": "90024"    }  };
   SOURCE.pdp.isCountySupportsValuation = true;
    SOURCE.pdp.isInHighDemandRegion = false;
    var _SPANLONG = pdp_location_data.longitude;
    var _SPANLAT = pdp_location_data.latitude;
    var _CENLONG = pdp_location_data.longitude;
   var _CENLAT = pdp_location_data.latitude;  </script>"""
# Parse the markup with BeautifulSoup using the "html.parser" parser.
soup = bsoup(source,"html.parser")


# Capture the object literal assigned to SOURCE.pdp.propertyJSON: everything
# from its opening "{" up to the "}" that directly precedes the next
# assignment (pdp_location_data).
#   * raw string  - "\." and "\s" are regex escapes, not string escapes
#   * re.DOTALL   - lets "." cross newlines, so the pattern also matches when
#                   the literal is spread over several lines
#   * ".*?"       - lazy, so the capture stops at the first "}" that is
#                   followed by pdp_location_data rather than a later one
#   * "\s*"       - tolerate any (or no) whitespace around "=" and after "}"
json_re = re.compile(
    r"SOURCE\.pdp\.propertyJSON\s*=\s*(\{.*?\})\s*pdp_location_data",
    re.DOTALL,
)

# Locate the one <script> whose content mentions the object. The dots are
# escaped so they match literally. `string=` is the current name of the bs4
# filter formerly called `text=`.
script_tag = soup.find("script", string=re.compile(r"SOURCE\.pdp\.propertyJSON"))
if script_tag is None:
    raise ValueError("no <script> element mentions SOURCE.pdp.propertyJSON")

# Use .string rather than .text for the raw script body.
# NOTE(review): some bs4 releases appear to omit <script> content from
# .text/get_text() - confirm against the installed version.
scr = script_tag.string

match = json_re.search(scr)
if match is None:
    raise ValueError("SOURCE.pdp.propertyJSON literal not found in the script")

js_raw = match.group(1)
# The captured literal is valid JSON, so it maps straight onto a dict.
json_dict = loads(js_raw)

Which would give you:

{u'numBeds': 2, u'neighborhood': u'Westwood', u'stateName': u'California', u'numFullBathrooms': 2, u'indexSource': u'Assessor', u'countyFIPS': u'06037', u'city': u'Los Angeles', u'isSrpFeatured': False, u'type': u'CONDO', u'formattedSqft': u'1,321 sqft', u'isOpaqueBAL': False, u'price': None, u'zipCode': u'90024', u'numBedrooms': u'2', u'neighborhoodId': 7187, u'county': u'Los Angeles', u'formattedBedAndBath': u'2bd, 2 full ba', u'sqft': 1321, u'numBathrooms': 2, u'stateCode': u'CA', u'isForeclosure': False, u'typeDisplay': u'Condo', u'foreclosureStatus': u''}

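If you want the pdp_location_data JSON, apply the same logic with a pattern anchored on "pdp_location_data = {" and on the "};" that closes it (that object contains nested braces, so match up to the closing "}" that is followed by ";").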

相关阅读:
Top