Getting Javascript Variable Value While Scraping With Python
I know this has been asked before, but I am a newbie to scraping and Python. Please help me; it would be very helpful on my learning path. I am scraping a news site using py…
Solution 1:
import re

# Sample of the page source: the <script> tags of the news page. The inline
# script declares `min_news_id` (line 1) and later reassigns it from the
# AJAX response (line 2).
html = '''<!-- Eliminate render-blocking JavaScript and CSS in above-the-fold content -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/0.97.0/js/materialize.min.js"></script>
<script type="text/javascript" src="/dist/scripts/index.js"></script>
<script type="text/javascript" src="/dist/scripts/read.js"></script>
<script src="/dist/scripts/jquery.scrolldepth.min.js"></script>
<script type="text/javascript">
var min_news_id = "d7zlgjdu-1"; // line 1
function loadMoreNews(){
$("#load-more-btn").hide();
$("#load-more-gif").show();
$.post("/en/ajax/more_news",{'category':'politics','news_offset':min_news_id},function(data){
data = JSON.parse(data);
min_news_id = data.min_news_id||min_news_id; // line 2
$(".card-stack").append(data.html);
})
.fail(function(){alert("Error : unable to load more news");})
.always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
</script>'''

# `.` does not cross newlines, so each match stays on one line and runs up to
# the last ';' on that line. Both assignments to min_news_id are returned.
finder = re.findall(r'min_news_id = .*;', html)
print(finder)
Output:
['min_news_id = "d7zlgjdu-1";', 'min_news_id = data.min_news_id||min_news_id;']
# 2. Or strip the assignment syntax from the first match to leave the bare id
first_match = finder[0]
for fragment in ('min_news_id = ', '"', ';'):
    first_match = first_match.replace(fragment, '')
print(first_match.strip())
Output:
d7zlgjdu-1
# 3. Or match the id's shape directly: eight lowercase letters or digits,
# a hyphen, then a single digit
id_pattern = re.compile(r'[a-z0-9]{8}-[0-9]')
finder = id_pattern.findall(html)
print(finder)
Output:
['d7zlgjdu-1']
Solution 2:
You can't monitor JavaScript variable changes using BeautifulSoup. Here is how to get the next page of news using a while loop, re, and json:
from bs4 import BeautifulSoup
import requests, re
# Page that embeds the initial `min_news_id`, and the AJAX endpoint that
# returns each following page of news.
page_url = 'https://inshorts.com/en/read/politics'
ajax_url = 'https://inshorts.com/en/ajax/more_news'

htmlPage = requests.get(page_url).text

# BeautifulSoup extract article summary
# page = BeautifulSoup(htmlPage, "html.parser")
# ...

# get current min_news_id (raw string so `\s` is a regex escape, not a
# string escape)
min_news_id = re.search(r'min_news_id\s+=\s+"([^"]+)', htmlPage).group(1)  # result: d7zlgjdu-1

customHead = {'X-Requested-With': 'XMLHttpRequest', 'Referer': page_url}

while min_news_id:
    # change "politics" if in different category
    reqBody = {'category': 'politics', 'news_offset': min_news_id}

    # get Ajax next page
    ajax_response = requests.post(ajax_url, headers=customHead, data=reqBody).json()  # parse string to json

    # again, do extract article summary
    page = BeautifulSoup(ajax_response["html"], "html.parser")
    # ....
    # ....

    # new min_news_id
    min_news_id = ajax_response["min_news_id"]

    # remove this to loop all page (thousand?)
    break
Solution 3:
Thank you for the response. I finally solved it using the requests package after reading its documentation. Here is my code:
# Fragment of a scraper method: `self`, `url`, `news_offset`, `urlopen`, `bs`,
# `myRequest` and `InShortsScraper` are defined outside this snippet.
# NOTE(review): the nesting below is reconstructed from a paste that lost its
# indentation. Confirm it against the original code.

# The first page declares the variable with `var`; later pages only reassign it.
if InShortsScraper.firstLoad == True:
    self.pattern = re.compile('var min_news_id = (.+?);')
else:
    self.pattern = re.compile('min_news_id = (.+?);')

page = None
# print("Pattern: " + str(self.pattern))
if news_offset is None:
    # No offset supplied: fetch the page itself.
    htmlPage = urlopen(url)
    page = bs(htmlPage, "html.parser")
else:
    # Offset supplied: POST it and read the next offset from the JSON reply.
    self.loadMore['news_offset'] = InShortsScraper.newsOffset
    # print("payload : " + str(self.loadMore))
    try:
        r = myRequest.post(
            url=url,
            data=self.loadMore
        )
    except TypeError:
        print("Error in loading")
    else:
        # Only touch the response when the request succeeded; otherwise `r`
        # is unbound. Parse the JSON body once and reuse it.
        payload = r.json()
        InShortsScraper.newsOffset = payload["min_news_id"]
        page = bs(payload["html"], "html.parser")

# print(page)
if InShortsScraper.newsOffset is None:
    # No offset known yet: look for it in the page's <script> contents.
    scripts = page.find_all("script")
    for script in scripts:
        for line in script:
            scriptString = str(line)
            if "min_news_id" in scriptString:
                finder = re.findall(self.pattern, scriptString)
                # The text can mention min_news_id without matching the
                # assignment pattern, so guard the index.
                if finder:
                    InShortsScraper.newsOffset = finder[0].replace('min_news_id = ', '').replace('"','').replace(';','').strip()
Post a Comment for "Getting Javascript Variable Value While Scraping With Python"