Scrapy/Python: Processing values in a yield -
I am trying to write a crawler using Scrapy/Python that reads values from a page.
I want the crawler to store the highest and lowest values in separate fields.
So far, I am able to read the values from the page (please see my code below), but I am not sure how to calculate the lowest and highest values and store them in separate fields.
For example, say the crawler reads a page and returns these values:
- burvale-score = 75.25
- richmond-score = 85.04
- somano-score = '' (value missing)
- tucson-score = 90.67
- cloud-score = 50.00
So I want to populate:
- 'highestscore': 90.67
- 'lowestscore': 50.00
How do I do that? Do I need to use an array? Should I put the values in an array and then pick the highest/lowest?
Also, please note that there are 2 yields in my code: the bottom yield provides the URLs to crawl, and the first yield crawls/collects the values from each URL provided by the bottom yield.
Any help is appreciated. Please provide code examples if you can.
Here is my code so far. I am storing -1 in case of missing values.
class myspider(BaseSpider):
    """Spider that starts at the course listing page and yields meta-tag values as dicts."""

    name = "courses"
    start_urls = ['http://www.example.com/all-courses-listing']
    allowed_domains = ["example.com"]

    def parse(self, response):
        """Yield one dict of meta-tag values per matched ``<meta>`` node, then follow listing links.

        Every link found under ``ul.scrapy`` is requested with this same
        method as its callback.
        """
        hxs = Selector(response)
        # for courses in response.xpath(response.body):
        # NOTE(review): the XPaths below start with "//", so they appear to search the
        # whole document rather than the current `courses` node; each iteration would
        # then produce the same values. Confirm whether the loop is needed at all.
        for courses in response.xpath("//meta"):
            yield {
                'pagetype': courses.xpath('//meta[@name="pagetype"]/@content').extract_first(),
                'pagefeatured': courses.xpath('//meta[@name="pagefeatured"]/@content').extract_first(),
                'pagedate': courses.xpath('//meta[@name="pagedate"]/@content').extract_first(),
                'pagebanner': courses.xpath('//meta[@name="pagebanner"]/@content').extract_first(),
                'pagetitle': courses.xpath('//meta[@name="pagetitle"]/@content').extract_first(),
                'pageurl': courses.xpath('//meta[@name="pageurl"]/@content').extract_first(),
                'pagedescription': courses.xpath('//meta[@name="pagedescription"]/@content').extract_first(),
                'pageid': courses.xpath('//meta[@name="pageid"]/@content').extract_first(),
                # A missing or empty score tag yields '' which falls back to -1.
                'courseatarburvale': float(courses.xpath('//meta[@name="courseatar-burvale"]/@content').extract_first('').strip() or -1),
                'courseatarrichmond': float(courses.xpath('//meta[@name="courseatar-richmond"]/@content').extract_first('').strip() or -1),
                'courseatarsomano': float(courses.xpath('//meta[@name="courseatar-somano"]/@content').extract_first('').strip() or -1),
                'courseatartucson': float(courses.xpath('//meta[@name="courseatar-tucson"]/@content').extract_first('').strip() or -1),
                'courseatarcloud': float(courses.xpath('//meta[@name="courseatar-cloud"]/@content').extract_first('').strip() or -1),
                # Open question: how should these two fields be computed?
                # 'highestscore': ??????
                # 'lowestscore': ??????
            }

        # Queue every linked page so it is parsed by this same callback.
        for url in hxs.xpath('//ul[@class="scrapy"]/li/a/@href').extract():
            yield Request(response.urljoin(url), callback=self.parse)
I would break down this part of your code:
yield { 'pagetype': courses.xpath('//meta[@name="pagetype"]/@content').extract_first(), 'pagefeatured': courses.xpath('//meta[@name="pagefeatured"]/@content').extract_first(), 'pagedate': courses.xpath('//meta[@name="pagedate"]/@content').extract_first(), 'pagebanner': courses.xpath('//meta[@name="pagebanner"]/@content').extract_first(), 'pagetitle': courses.xpath('//meta[@name="pagetitle"]/@content').extract_first(), 'pageurl': courses.xpath('//meta[@name="pageurl"]/@content').extract_first(), 'pagedescription': courses.xpath('//meta[@name="pagedescription"]/@content').extract_first(), 'pageid': courses.xpath('//meta[@name="pageid"]/@content').extract_first(), 'courseatarburvale': float(courses.xpath('//meta[@name="courseatar-burvale"]/@content').extract_first('').strip() or -1), 'courseatarrichmond': float(courses.xpath('//meta[@name="courseatar-richmond"]/@content').extract_first('').strip() or -1), 'courseatarsomano': float(courses.xpath('//meta[@name="courseatar-somano"]/@content').extract_first('').strip() or -1), 'courseatartucson': float(courses.xpath('//meta[@name="courseatar-tucson"]/@content').extract_first('').strip() or -1), 'courseatarcloud': float(courses.xpath('//meta[@name="courseatar-cloud"]/@content').extract_first('').strip() or -1), 'highestscore'; ?????? 'lowestscore'; ?????? }
into this:
item = { 'pagetype': courses.xpath('//meta[@name="pagetype"]/@content').extract_first(), 'pagefeatured': courses.xpath('//meta[@name="pagefeatured"]/@content').extract_first(), 'pagedate': courses.xpath('//meta[@name="pagedate"]/@content').extract_first(), 'pagebanner': courses.xpath('//meta[@name="pagebanner"]/@content').extract_first(), 'pagetitle': courses.xpath('//meta[@name="pagetitle"]/@content').extract_first(), 'pageurl': courses.xpath('//meta[@name="pageurl"]/@content').extract_first(), 'pagedescription': courses.xpath('//meta[@name="pagedescription"]/@content').extract_first(), 'pageid': courses.xpath('//meta[@name="pageid"]/@content').extract_first(), } scores = { 'courseatarburvale': float(courses.xpath('//meta[@name="courseatar-burvale"]/@content').extract_first('').strip() or -1), 'courseatarrichmond': float(courses.xpath('//meta[@name="courseatar-richmond"]/@content').extract_first('').strip() or -1), 'courseatarsomano': float(courses.xpath('//meta[@name="courseatar-somano"]/@content').extract_first('').strip() or -1), 'courseatartucson': float(courses.xpath('//meta[@name="courseatar-tucson"]/@content').extract_first('').strip() or -1), 'courseatarcloud': float(courses.xpath('//meta[@name="courseatar-cloud"]/@content').extract_first('').strip() or -1), } values = sorted(x x in scores.values() if x > 0) scores.update({ 'highestscore': values[-1], 'lowestscore': values[0], }) item.update(scores) yield item
Comments
Post a Comment