Minimal working pyspider code
Index page: collect URLs with a pattern check
```python
@config(age=10 * 24 * 60 * 60)
def index_page(self, response):
    for each in response.doc('a[href^="http"]').items():
        # detail pages match the /sell/show... pattern; every other link is
        # treated as another listing page and fed back into index_page
        if re.match('http://www.yunjinet.com/sell/show.+', each.attr.href):
            self.crawl(each.attr.href, callback=self.detail_page)
        else:
            self.crawl(each.attr.href, callback=self.index_page)
```
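The snippet above is a method of a pyspider `Handler` class, so at module level it still needs `import re` plus the usual base-handler import. A minimal surrounding skeleton might look like the sketch below; the seed URL in `on_start` is an assumption, not taken from the original.

```python
import re  # used by the re.match() filter in index_page

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {}

    @every(minutes=24 * 60)
    def on_start(self):
        # hypothetical seed URL for the listing pages; replace with the real one
        self.crawl('http://www.yunjinet.com/sell/', callback=self.index_page)
```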
Index page: collect URLs with a pagination loop
```python
@config(age=10 * 24 * 60 * 60)
def index_page(self, response):
    # collect the photo-set (detail page) URLs on this listing page
    for each in response.doc('.list_products_row_box .list_products_name > a').items():
        self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
    # follow the "next page" link and re-enter index_page
    next_url = response.doc('nav a').attr.href
    self.crawl(next_url, callback=self.index_page, validate_cert=False)
```
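One detail worth guarding against: on the last listing page the `nav a` selector matches nothing and `.attr.href` comes back as `None`, so the final `self.crawl()` call receives a bogus URL. A small defensive variant of the pagination step, using the same selector as above:

```python
# follow the "next page" link only when one actually exists
next_url = response.doc('nav a').attr.href
if next_url:
    self.crawl(next_url, callback=self.index_page, validate_cert=False)
```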
Index page: plain page-number increment
```python
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {}

    def __init__(self):
        self.base_url = 'https://mm.taobao.com/json/request_top_list.htm?page='
        self.page_num = 1
        self.total_num = 30

    @every(minutes=24 * 60)
    def on_start(self):
        # queue every listing page from 1 to total_num
        while self.page_num <= self.total_num:
            url = self.base_url + str(self.page_num)
            print(url)
            self.crawl(url, callback=self.index_page)
            self.page_num += 1

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
```
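Since all thirty page URLs are known up front, the counter state kept in `__init__` is not strictly necessary; an equivalent sketch generates the same URLs directly inside `on_start`:

```python
@every(minutes=24 * 60)
def on_start(self):
    base_url = 'https://mm.taobao.com/json/request_top_list.htm?page='
    for page_num in range(1, 31):  # pages 1 through 30, matching total_num above
        self.crawl(base_url + str(page_num), callback=self.index_page)
```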
Detail page: collect URLs in a loop
```python
@config(priority=2)
def detail_page(self, response):
    # grab the image(s) on the current page
    for each in response.doc('.xzoom-container > img.img-responsive').items():
        img_url = each.attr.src
    # grab the next-page URL and keep walking through the set with detail_page
    for each in response.doc('.pagenavi > a:last-child').items():
        self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
```
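Note that `img_url` above is collected but never returned, so nothing ever reaches `on_result`. A sketch that gathers the image URLs and returns them together with the page data; the `img_urls` field name is illustrative, not from the original:

```python
@config(priority=2)
def detail_page(self, response):
    # collect every image on the current page
    img_urls = [img.attr.src
                for img in response.doc('.xzoom-container > img.img-responsive').items()]
    # queue the next page of the same photo set
    for each in response.doc('.pagenavi > a:last-child').items():
        self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
    return {
        "url": response.url,
        "title": response.doc('title').text(),
        "img_urls": img_urls,  # illustrative field name
    }
```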
Saving results into a database table
```python
from pyspider.libs.base_handler import *
import re
from pyspider.database.mysql.mysqldb import SQL
import html

# database connection settings for the result table
resultdb_config = {
    'host': '127.0.0.1',
    'username': 'root',
    'password': 'root',
    'database': 'pyspider',
}


# create the result storage table when the project starts
@every(minutes=24 * 60)
def on_start(self):
    self.create_result_table()


# write each crawl result into the <project>_result table
def on_result(self, result):
    if not result or not result['v_metatags_title_1']:
        return
    sql = SQL(host=resultdb_config['host'],
              username=resultdb_config['username'],
              password=resultdb_config['password'],
              database=resultdb_config['database'])
    sql.insert(self.project_name + '_result', **result)


# create the crawler's result storage table if it does not exist yet
def create_result_table(self):
    sql = SQL(host=resultdb_config['host'],
              username=resultdb_config['username'],
              password=resultdb_config['password'],
              database=resultdb_config['database'])
    new_tb_sql = """CREATE TABLE IF NOT EXISTS `""" + self.project_name + '_result' + """` (
        `v_products_model` mediumtext NOT NULL,
        `v_products_type` int(2) DEFAULT '1',
        `v_products_image` mediumtext NOT NULL,
        `v_products_name_1` mediumtext NOT NULL,
        `v_products_description_1` mediumtext NOT NULL,
        `v_products_url_1` mediumtext NOT NULL,
        `v_specials_price` int(2) DEFAULT '0',
        `v_specials_date_avail` varchar(50) DEFAULT '0000-00-00 00:00:00',
        `v_specials_expires_date` varchar(50) DEFAULT '0000-00-00 00:00:00',
        `v_products_price` mediumtext NOT NULL,
        `v_products_weight` int(2) DEFAULT '0',
        `v_product_is_call` int(2) DEFAULT '0',
        `v_products_sort_order` int(2) DEFAULT '1',
        `v_products_quantity_order_min` int(2) DEFAULT '1',
        `v_products_quantity_order_units` int(2) DEFAULT '1',
        `v_products_priced_by_attribute` int(2) DEFAULT '0',
        `v_product_is_always_free_shipping` int(2) DEFAULT '1',
        `v_date_avail` varchar(50) DEFAULT '0000-00-00 00:00:00',
        `v_date_added` varchar(50) DEFAULT '0000-00-00 00:00:00',
        `v_products_quantity` int(10) DEFAULT '1000',
        `v_manufacturers_name` mediumtext NOT NULL,
        `v_categories_name_1` mediumtext NOT NULL,
        `v_tax_class_title` varchar(50) DEFAULT '--none--',
        `v_status` int(2) DEFAULT '1',
        `v_html_uri` mediumtext NOT NULL,
        `v_products_options_type` int(2) DEFAULT '0',
        `v_metatags_title_1` mediumtext NOT NULL,
        `v_metatags_keywords_1` mediumtext NOT NULL,
        `v_metatags_description_1` mediumtext NOT NULL,
        `v_products_options_name_1` mediumtext NOT NULL,
        `v_products_options_values_name_1` mediumtext NOT NULL,
        `v_products_options_name_2` mediumtext NOT NULL,
        `v_products_options_values_name_2` mediumtext NOT NULL
    ) ENGINE=MyISAM DEFAULT CHARSET=utf8"""
    try:
        sql_result = sql.execute(new_tb_sql)
        sql.disconnect()
        return sql_result is not False
    except Exception as e:
        print("error while checking/creating the result table: " + str(e))
        return False
```
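For `sql.insert(self.project_name + '_result', **result)` to work, the dict returned by `detail_page` has to use the column names of the table above as its keys, and the guard in `on_result` additionally requires `v_metatags_title_1` to be present. A minimal sketch with placeholder CSS selectors, covering only a few of the columns; depending on the MySQL SQL mode, the remaining NOT NULL columns may also need explicit (possibly empty) values:

```python
@config(priority=2)
def detail_page(self, response):
    # keys must match column names in the <project>_result table;
    # the selectors below are placeholders for the target site
    return {
        'v_metatags_title_1': response.doc('title').text(),
        'v_products_name_1': response.doc('h1').text(),
        'v_products_url_1': response.url,
        'v_products_description_1': response.doc('.description').text(),
        'v_products_image': response.doc('img').attr.src or '',
    }
```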
