zhe800(折800)是一个淘宝导购网站,主打低价与折扣。下面是一个按分类采集商品信息的脚本,采集结果保存到一个 CSV 文件中,代码如下:
# coding=utf-8
"""Scrape deal listings from zhe800.com category pages into a CSV file.

Each URL in ``urls`` is downloaded, the deal array embedded in the page's
``setDeal({deals:[...],`` call is extracted, and every deal that still has
remaining time is written as one row of ``zhe800.csv`` (GBK-encoded).

Only single-argument ``print(...)`` calls are used so the script behaves the
same on Python 2 and Python 3.
"""
import json
import re

import requests
import unicodecsv

# zhe800 category list pages; brand-group pages are not supported yet.
urls = [
    'http://www.zhe800.com/ju_tag/taodianqi',  # digital / home appliances
    'http://www.zhe800.com/ju_tag/taomeishi',  # food
]

# Captures everything between "setDeal({deals:" and the first "]," so that
# appending "]" to the capture yields the deal array as JSON text.
deal_re = re.compile(r'setDeal\(\{deals:(.*?)\],')

# Keys read from each deal object, in CSV column order.
_DEAL_KEYS = (
    'deal_title',
    'price',
    'list_price',
    'discount',
    'recommend_reason',
    'remainder_time',
    'deal_url',
)


def get(url, timeout=30):
    """Download one category page and return its deals as rows.

    Args:
        url: Address of a zhe800 category list page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        A list with one list per deal, holding the values of ``_DEAL_KEYS``
        in that order. Empty when the page contains no ``setDeal`` payload.

    Raises:
        requests.RequestException: The download failed or timed out.
        ValueError: The captured payload is not valid JSON.
        KeyError: A deal object lacks one of the expected keys.
    """
    # NOTE(review): an earlier version chained two .replace("", "") calls on
    # the page text here. They were no-ops; their arguments were presumably
    # lost (possibly newline stripping, since the pattern is not DOTALL).
    # Confirm against a live page before reinstating anything.
    html = requests.get(url, timeout=timeout).text
    match = deal_re.search(html)
    if not match:
        print(u'找不到相应数据')
        return []
    # The pattern stops before the closing bracket, so restore it.
    deals = json.loads(match.group(1) + ']')
    return [[deal[key] for key in _DEAL_KEYS] for deal in deals]


def main():
    """Scrape every category in ``urls`` and write the deals to zhe800.csv."""
    print(u"开始工作")
    # "with" guarantees the file is closed even if a download raises.
    with open('zhe800.csv', 'wb') as f:
        w = unicodecsv.writer(f, encoding='gbk')
        w.writerow((u'标题', u'折扣价', u'原价', u'折扣比', u'推荐理由', u'剩余时间', u'购买链接'))
        for url in urls:
            print(u"开始下载%s" % url)
            try:
                items = get(url)
            except requests.RequestException as e:
                # One unreachable category should not abort the others.
                print(str(e))
                continue
            for item in items:
                try:
                    # A remaining time of 0 means the deal is sold out.
                    if item[5]:
                        w.writerow(tuple(item))
                except Exception as e:
                    # Best effort: report the offending row (e.g. text that
                    # cannot be encoded as GBK) and keep going.
                    print(item)
                    print(str(e))
                    continue
    print(u"结束工作")


if __name__ == "__main__":
    main()