Crawling all the Meituan food data for Nanning

Analysis approach

The list endpoint is a plain JSON API, so the crawler simply reads the food categories from list.txt, pages through getPoiList for each category, and writes each shop into MySQL. To reuse it, you only need to replace the Cookie value in the request headers with one from your own browser session. Note that the listing below is actually configured for Changzhou (cz.meituan.com, cityName=常州); point the subdomain and cityName at nn/南宁 to crawl Nanning instead.
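
For reference, a typical request the crawler issues looks like this; cityName, cateId, and page are the query parameters it varies (the cateId value is illustrative, and the uuid must match the one in your Cookie):

http://cz.meituan.com/meishi/api/poi/getPoiList?cityName=常州&uuid=<your-uuid>&cateId=17&page=1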

Source code walkthrough

# coding=utf-8

import requests
import json
import pymysql
import traceback
import time


class splider:

    def __init__(self):
        self.currentPage = 1

        # The city subdomain is hardcoded here: cz is Changzhou (use nn for Nanning)
        self.listUrl = "http://cz.meituan.com/meishi/api/poi/getPoiList?"

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
            "Cookie": "uuid=63b8688d549b46efabda.1522164432.1.0.0; __mta=141907213.1522164452181.1522164452181.1522164452181.1; ci=89; rvct=89; _lxsdk_cuid=16268133435c8-0c8d41a754c31a-3a614f0b-144000-16268133436c8; _lxsdk_s=16268133436-27a-974-cd1%7C%7C3"
         }

        # Open the database connection (host / user / password / database name).
        # The charset matters here; without it you get mojibake, just as under Python 2.7.
        self.db = pymysql.connect(host="localhost", user="root", password="123456",
                                  database="meishi", charset="utf8")
        # Create a cursor object with cursor()
        self.cursor = self.db.cursor()

        self.name_list = []
        self.id_list = []


    def splider(self):
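        """Read the category list, then crawl every category.

        Each line of list.txt is expected to be "<cateId> <name>", separated by
        a single space, e.g. "17 火锅" (the ID here is only an illustration).
        """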

        # Load every food category (id and name) from list.txt
        with open("list.txt", 'r', encoding='utf8') as f:
            for classify in f.readlines():
                fields = classify.strip('\n').split(' ')
                self.id_list.append(fields[0])
                self.name_list.append(fields[1])

        # Crawl each category in turn (cateId is an index into id_list/name_list)
        for cateId in range(len(self.id_list)):
            self.currentPage = 1
            self.city_splider(1, cateId)
            print("------ finished category " + self.id_list[cateId])

    """
    爬取每一个城市的数据
    """

    def city_splider(self, currentPage, cateId):
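        # The uuid below is the same uuid value carried in the Cookie header above.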

        kw = {
            'cityName': '常州',
            'uuid': '63b8688d549b46efabda.1522164432.1.0.0',
            'cateId': self.id_list[cateId],
            'page': currentPage
        }

        response = requests.get(self.listUrl, params=kw, headers=self.headers)

        # print(response.url)

        # Decode the body explicitly as utf-8, then parse the JSON
        # time.sleep(1)  # optionally sleep between requests to throttle the crawl
        json_response = response.content.decode(encoding='utf-8')
        dict_json = json.loads(json_response)

        if len(dict_json['data']['poiInfos']) != 0:
            self.getShopInfo(dict_json, cateId)
        print(self.id_list[cateId] + ": stored page " + str(currentPage))

        # dict_json['data']['totalCounts'] is the total shop count for this category;
        # the page size of 32 is hardcoded for now
        if self.currentPage * 32 < dict_json['data']['totalCounts']:
            self.currentPage = self.currentPage + 1
            self.city_splider(self.currentPage, cateId)

    '''
    Shop fields extracted from each poiInfos entry:
    title: name
    avgScore: rating
    allCommentNum: number of comments
    address: address
    avgPrice: average price
    '''

    def getShopInfo(self, dict_json, cateId):
        for info in dict_json['data']['poiInfos']:
            self.saveData(info, cateId)

    def saveData(self, info, cateId):

        # Strip the branch suffix (full-width or ASCII parentheses)
        # to get the brand's real name
        name_str = info['title']
        if "（" in name_str:
            name_str = name_str.split('（')[0]
        elif "(" in name_str:
            name_str = name_str.split('(')[0]
        elif "'" in name_str:
            name_str = name_str.strip("'")

        avgScore = info['avgScore']
        allCommentNum = info['allCommentNum']
        address = info['address']
        avgPrice = info['avgPrice']

        # Same cleanup for the address
        if '(' in address:
            address = address.split('(')[0]
        elif "（" in address:
            address = address.split('（')[0]
        elif "'" in address:
            address = address.strip("'")

        try:
            # Parameterized query: the driver handles quoting and escaping,
            # so names containing quotes cannot break the statement
            sql = """
                INSERT INTO cz(name, avgScore, avgPrice, allCommentNum, address, cateId)
                VALUES (%s, %s, %s, %s, %s, %s)"""
            self.cursor.execute(sql, (name_str, avgScore, avgPrice, allCommentNum,
                                      address, self.name_list[cateId]))

        except Exception:
            traceback.print_exc()

        finally:
            self.db.commit()

    def get_all_shop_count(self):
        # Helper (not called from main): read one brand name per line from brand.txt
        brand_list = []
        with open("brand.txt", 'r') as f:
            for brand in f.readlines():
                brand_list.append(brand.strip('\n'))

        for brand in brand_list:
            self.getShopCount(brand)

    def getShopCount(self, q):
        # Helper (not called from main): it assumes a search endpoint URL in
        # self.shopCount and a saveShopTmp() method, neither of which is
        # defined in this file.
        # Cap the result count at 1000 shops
        kw = {
            'limit': '1000',
            'uuid': '8e5efbb68b0f42f89d2c.1521816215.1.0.0',
            'q': q
        }
        response = requests.get(self.shopCount, params=kw, headers=self.headers)

        json_response = response.content.decode(encoding='utf-8')
        dict_json = json.loads(json_response)
        self.saveShopTmp(dict_json, q)


    # Close the cursor first, then the database connection
    def close_db(self):
        self.cursor.close()
        self.db.close()


if __name__ == '__main__':
    # Crude attempt to make requests retry aggressively on connection errors
    requests.adapters.DEFAULT_RETRIES = 99999

    s = splider()
    s.splider()
    s.close_db()
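
For the INSERT statement above to work, a cz table with matching columns must already exist in the meishi database. A minimal schema sketch (the column types are assumptions; tune them to your data):

CREATE TABLE cz (
    name          VARCHAR(255),  -- shop/brand name, branch suffix stripped
    avgScore      VARCHAR(16),   -- rating, passed through as-is by the crawler
    avgPrice      VARCHAR(16),   -- average price per person
    allCommentNum VARCHAR(16),   -- number of comments
    address       VARCHAR(255),  -- address, branch suffix stripped
    cateId        VARCHAR(64)    -- note: the crawler stores the category name here
) DEFAULT CHARSET = utf8;

The DEFAULT CHARSET must agree with the charset passed to pymysql.connect, otherwise Chinese shop names will be garbled on the way in.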