python - BeautifulSoup-extrae json de JS
html html-parsing (1)
Estoy jugando con BeautifulSoup y estoy buscando una forma de obtener una cadena JSON específica dentro de un elemento JS.
Aquí está el JS:
<script>window.pinball = window.pinball || [];
window.pinball.push(['add', {"srp_cleanup":"inactive","book_visit":"inactive","my_visits":"inactive"}]);
window.Rent = window.Rent || {};
window.Rent.zutron = {"error_div":".js-generic-error","host":"rent","user_type":null,"zid":null,"origin":null,"provider":null};
window.Rent.book_visit = {"book_visit_host":"http://bookavisit.prod.services.rentpath.com"}
window.Rent.tagging = {"tealium":{"env":"prod","profile":"tealium.rent.com","account":"rentpath"}};
window.Rent.realm = "rent";
window.Rent.data = {"floorplans":{"1159255":{"availability":"1 Unit Available","availability_class":"floorplan-available-now","unitstyle":"aa1- 1 Bed/1 Bath","deposit":"","floorplan_id":1159255,"bed":"1 bed","listing_id":"571535","bath":"1 bath","sqft":"763 sqft","rent":"$1950 - $2322 /mo","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/52ad5930427b3e739676240c01b7d6cc/650-","fp3dfurnished":"http://image1.rent.com/imgr/07733fbd8c8a6a9134d5e0af77d52cb2/650-","floorplanimage":"http://image.rent.com/imgr/44c2395728fa733c2682506d96ec68f5/650-"},"1159257":{"availability":"2 Units Available","availability_class":"floorplan-available-now","unitstyle":"aa3- 1 Bed/1 Bath","deposit":"","floorplan_id":1159257,"bed":"1 bed","listing_id":"571535","bath":"1 bath","sqft":"893 sqft","rent":"$1995 - $2531 /mo","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/187753b2e7e6beb5aaf8602514361d89/650-","fp3dfurnished":"http://image.rent.com/imgr/55673aa4253387f0d06aa02495ccf2bc/650-","floorplanimage":"http://image.rent.com/imgr/389adb5ac1fa61c56aa04c88fe97c02f/650-"},"1159259":{"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"aa5- 1 Bed/1 Bath","deposit":"","floorplan_id":1159259,"bed":"1 bed","listing_id":"571535","bath":"1 bath","sqft":"899 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","floorplanimage":"http://image.rent.com/imgr/24059a4611740bd58436236758d65e20/650-"},"1159256":{"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"aa2- 1 Bed/1 Bath","deposit":"","floorplan_id":1159256,"bed":"1 bed","listing_id":"571535","bath":"1 bath","sqft":"880 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","floorplanimage":"http://image1.rent.com/imgr/0854a95e69c0b75ee0b13c41db2f31f1/650-"},"1159258":{"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"aa4- 1 Bed/1 
Bath","deposit":"","floorplan_id":1159258,"bed":"1 bed","listing_id":"571535","bath":"1 bath","sqft":"897 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","floorplanimage":"http://image1.rent.com/imgr/deb3efc9ee3933a0a1b4844d886b7a8a/650-"},"1159262":{"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc3- 2 Bed/2 Bath","deposit":"","floorplan_id":1159262,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1194 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","floorplanimage":"http://image1.rent.com/imgr/a1fff6050e86f98b7249b843cd6f0836/650-"},"1159263":{"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc4- 2 Bed/2 Bath","deposit":"","floorplan_id":1159263,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1201 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image1.rent.com/imgr/33e2bb30c9aa1fcdbbf8ce4882a18fcd/650-","fp3dfurnished":"http://image.rent.com/imgr/c4d4df83e18f2b12c8cae6dab523769b/650-","floorplanimage":"http://image.rent.com/imgr/11ac88f52ca904e7646e03b6791f8455/650-"},"1159266":{"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc7- 2 Bed/2 Bath","deposit":"","floorplan_id":1159266,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1461 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/0a3887c07a7bc05670a826cd5562c49d/650-","fp3dfurnished":"http://image.rent.com/imgr/efa94735904b40ba463cbd26bc5504cf/650-","floorplanimage":"http://image1.rent.com/imgr/36413f72b93f0b0ed2f4f89337ef719d/650-"},"1159264":{"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc5- 2 Bed/2 Bath","deposit":"","floorplan_id":1159264,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1325 sqft","rent":"Contact for 
Pricing","propertyname":"Reading Commons","floorplanimage":"http://image.rent.com/imgr/ce1627742dbca97cc44d726b1d906fc3/650-"},"1159267":{"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bcl1-2 Bed/2 Bath","deposit":"","floorplan_id":1159267,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1500 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/a5888b34db510f6932af116e5197ce0c/650-","fp3dfurnished":"http://image1.rent.com/imgr/68f33736e29613562d9a5618eec1a4c6/650-","floorplanimage":"http://image1.rent.com/imgr/d7a833b56639b121178ddc86ac074754/650-"},"1159261":{"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc2- 2 Bed/2 Bath","deposit":"","floorplan_id":1159261,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1187 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/33e2bb30c9aa1fcdbbf8ce4882a18fcd/650-","fp3dfurnished":"http://image.rent.com/imgr/c4d4df83e18f2b12c8cae6dab523769b/650-","floorplanimage":"http://image1.rent.com/imgr/11ac88f52ca904e7646e03b6791f8455/650-"},"1159265":{"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc6- 2 Bed/2 Bath","deposit":"","floorplan_id":1159265,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1400 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/3f80d6e4386db5f450a6750c1a537b84/650-","fp3dfurnished":"http://image1.rent.com/imgr/f54aefd699a9ed3f1d8b6fb8e4ce1500/650-","floorplanimage":"http://image1.rent.com/imgr/b78bda34547615be4973da38dbd9a10f/650-"},"1159260":{"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc1- 2 Bed/2 Bath","deposit":"","floorplan_id":1159260,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1121 
sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image1.rent.com/imgr/3b4e4306d4cc2317bd271888532405a0/650-","fp3dfurnished":"http://image1.rent.com/imgr/8ca6a08b9c4eed76575520b4f1dcc03c/650-","floorplanimage":"http://image.rent.com/imgr/f25bcd28009d72a91f02d4e125340b65/650-"},"1159268":{"availability":"1 Unit Available","availability_class":"floorplan-available-now","unitstyle":"cdta1- 3 Bed/3 Bath Office TH","deposit":"","floorplan_id":1159268,"bed":"3 beds","listing_id":"571535","bath":"3 baths","sqft":"2100 sqft","rent":"$3798 - $5073 /mo","propertyname":"Reading Commons","fp3dunfurnished":"http://image1.rent.com/imgr/82ba57c2f1be5071c3d5f48a79c9d45e/650-","fp3dfurnished":"http://image.rent.com/imgr/bc7908ca722b6f9407a247ebf7af49bd/650-","floorplanimage":"http://image.rent.com/imgr/3c881fbe1aba5ba7be68ca6399e7daa3/650-"},"1159269":{"availability":"1 Unit Available","availability_class":"floorplan-available-now","unitstyle":"cdta2- 3 Bed/3 Bath Office TH","deposit":"","floorplan_id":1159269,"bed":"3 beds","listing_id":"571535","bath":"3 baths","sqft":"2310 sqft","rent":"$3908 - $4995 /mo","propertyname":"Reading Commons","fp3dunfurnished":"http://image1.rent.com/imgr/86b5248dfbaef2534218a8bdb724d93e/650-","fp3dfurnished":"http://image.rent.com/imgr/ee01414c664925a3463bad279f943363/650-","floorplanimage":"http://image.rent.com/imgr/ba58885223be2f4f8bfd1588d9ddca9e/650-"}},"reviews":{"startingrecordnumber":1,"totalnumberofmatchingrecords":18,"numberofrecordsreturned":10,"numberofpages":2,"endingrecordnumber":10,"pagenumber":1,"numberofrecordsperpage":10},"listing":{"id":"571535","name":"Reading Commons","address_full":"7 Archstone Circle, Reading, MA 01867","phone_desktop":"(781) 205-2341","visits_enabled":true}};
window.Rent.mapbox_api_key = "pk.eyJ1IjoibmhnbWFwYm94IiwiYSI6ImNpb2VrYW5uazAwbHp5OG0yYmp6bms5bjYifQ.4RylIPWDNDEie2NreUsbig";
window.Rent.asset_host = "rent.assets.rentpathcdn.com";
window.zutron_host = "http://zutron.primedia.com";
window.ONESEARCH_URL = "http://onesearch.svc.primedia.com";
window.Rent.pageType = "pdp";
// these two globals are used in onesearch.js, not sure where else
window.channel = "apartments";
window.APPLICATION = "rent";
window.googletag = window.googletag || {};
window.googletag.cmd = window.googletag.cmd || [];
// SID is used by the Moving Leads Service
window.Rent.MOVING_LEADS_SID = 96;</script>
Pude sacar el JS a través de BeautifulSoup y estoy buscando la cadena JSON correspondiente a window.Rent.data.
¿Hay alguna manera de hacer esto sin tener que recurrir a re?
La idea es usar un patrón de expresión regular con un grupo de captura. Primero, usa esa expresión regular para localizar el elemento script por su texto y, después, para extraer la subcadena del contenido del script. Por último, puedes usar json.loads() para cargar la cadena JSON en un objeto Python:
import json
import re
from bs4 import BeautifulSoup
# Extract the JSON object literal assigned to ``window.Rent.data`` inside an
# inline <script> element and decode it into Python objects.
data = """
your HTML here"""

soup = BeautifulSoup(data, "html.parser")

# Regex escapes must be backslashes: \s+ is whitespace around "=", \{ and \}
# are literal braces, \. is a literal dot, and \n anchors the end of the
# statement. The non-greedy group stops at the first "};" followed by a
# newline. "." does not match newlines here, so the assignment is expected to
# sit on a single line; pass re.DOTALL if the JSON spans several lines.
pattern = re.compile(r"window\.Rent\.data\s+=\s+(\{.*?\});\n")

# ``string=`` is the current name of the argument formerly called ``text=``;
# it selects the <script> whose text content matches the pattern.
script = soup.find("script", string=pattern)
if script is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("no <script> element assigns window.Rent.data")

# Group 1 is the raw JSON text; json.loads turns it into dicts/lists/scalars.
data = json.loads(pattern.search(script.string).group(1))
print(data)
También hay otra forma: usar un analizador de JavaScript. He experimentado con slimit un par de veces; échale un vistazo.