使用python爬取一个省市城市列表

文章目录

前言

最近打算使用省市城市列表数据,但是目前没有数据来源,就想着搜搜有没有 json 的数据,结果搜了一下只有各种数据库的

然后搜了下有个网站说有很完整的数据的还是收费的

作为一个穷人程序员,如果是公司用,我付费买一个也无所谓,但是本身是想私人使用,甚至开源出去,付费就没必要了

这时候我想着,既然如此,我就爬一份数据,自己造一个吧

说到爬虫,我们就想起了明年..两开花 python

开发环境

使用的语言是 python3

requests_html + 基本库

爬取网页

查看网页

一级页面

使用 chrome dev 工具查看元素

20190125135659.png

呃. table 体系,最近几年算是比较少见了

分析了一下,整个页面只有备案号和省份名称是 a 标签,这下过滤一下备案号,剩下的不就是我们要的数据了吗

二级页面

点开北京,数据比较少,只有市辖区

20190125140053.png

内蒙的就比较多一点了

20190125140122.png

纯数字的是编码,其他的是名称,也是过滤掉 ICP 备案的就好

三级页面

和二级页面基本一致

20190125140255.png

撸码

city_get.py

  1import json
  2from requests_html import HTMLSession
  3import requests_html
  4
# Shared requests_html session reused by every page fetch in this module.
session = HTMLSession()
  6
  7
  8class Entity:
  9    name: str
 10    link: str
 11    no: str
 12    baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/"
 13
 14    def __str__(self) -> str:
 15        return "name:%s,link=%s" % (self.name, self.link)
 16
 17    def __eq__(self, o: object) -> bool:
 18        return self.link == o.link
 19
 20    def to_json(self) -> str:
 21        pass
 22
 23
 24class Province(Entity):
 25
 26    def __init__(self) -> None:
 27        super().__init__()
 28        self.cityList = []
 29
 30    def __str__(self) -> str:
 31        return "name:%s,link=%s" % (self.name, self.link)
 32
 33    def fetch_city_list(self):
 34        url = "%s%s" % (Entity.baseUrl, self.link)
 35        r = session.get(url)
 36        r.encoding = "gbk"
 37        h: requests_html.HTML = r.html
 38        li: list[requests_html.Element] = h.find("a")
 39        for a in li:
 40            text = a.text
 41            if text.__contains__("京ICP"):
 42                continue
 43            href_ = a.attrs["href"]
 44            city = City()
 45            city.link = href_
 46            city.province = self
 47            # print(text, href_)
 48
 49            try:
 50                index = self.cityList.index(city)
 51                city = self.cityList[index]
 52            except ValueError:
 53                self.cityList.append(city)
 54
 55            if text.isnumeric():
 56                city.no = text
 57            else:
 58                city.name = text
 59
 60        for city in self.cityList:
 61            city.fetch_county_list()
 62
 63    def to_json(self) -> str:
 64        pass
 65
 66
 67class City(Entity):
 68    province: Province
 69
 70    def __init__(self) -> None:
 71        super().__init__()
 72        self.countyList = []
 73
 74    def fetch_county_list(self):
 75        print("%s 开始" % self.name)
 76        url = "%s%s" % (Entity.baseUrl, self.link)
 77        r = session.get(url)
 78        r.encoding = "gbk"
 79        h: requests_html.HTML = r.html
 80        li: list[requests_html.Element] = h.find("a")
 81        for a in li:
 82            text = a.text
 83            if text.__contains__("京ICP"):
 84                continue
 85            href_ = a.attrs["href"]
 86            county = County()
 87            county.link = href_
 88            county.province = self
 89            # print(text, href_)
 90
 91            try:
 92                index = self.countyList.index(county)
 93                county = self.countyList[index]
 94            except ValueError:
 95                self.countyList.append(county)
 96
 97            if text.isnumeric():
 98                county.no = text
 99            else:
100                county.name = text
101
102        for county in self.countyList:
103            # print(county.__str__())
104            pass
105
106        print("%s 结束" % self.name)
107
108    pass
109
110
class County(Entity):
    """Leaf record: a county/district under a City."""
    # Back-reference to the owning City.
    city: City
115
# All provinces discovered on the index page. Populated at import time on
# purpose: dump_data.py imports this list and expects it to be ready.
provinceList = []


def fetch_province_list():
    """Scrape the 2017 index page and fill provinceList.

    Province links are the only anchors ending in "html" on the index
    page (the remaining anchor is the ICP-registration link), so
    filtering by suffix is sufficient.
    """
    response = session.get("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html")
    response.encoding = "gbk"  # index page is GBK encoded
    html: requests_html.HTML = response.html
    for a in html.find("a"):
        ae: requests_html.Element = a
        href: str = ae.attrs.get("href")
        # Robustness fix: attrs.get() returns None for anchors without an
        # href, and the original endswith() call then raised AttributeError.
        if href and href.endswith("html"):
            province = Province()
            province.name = ae.text.lstrip()
            province.link = href.lstrip()
            provinceList.append(province)


fetch_province_list()

if __name__ == '__main__':
    # Manual smoke test: fetch one province's full city/county tree.
    for p in provinceList:
        if p.name == "黑龙江省":
            p.fetch_city_list()

dump_data.py

 1import json
 2
 3from city.city_get import Province, County, City, provinceList
 4import datetime
 5
 6# for p in provinceList:
 7#     pr: Province = p
 8version = 2
 9
10di = dict()
11di["version"] = version
12now = datetime.datetime.now()
13date = datetime.datetime.strftime(now, "%Y-%m-%d %H:%M:%S")
14di["date"] = date
15di["timeStamp"] = now.timestamp()
16
17proList = []
18
19
def make_province(p: Province):
    """Fetch p's cities (and their counties) and append a plain-dict
    representation of the whole province to proList.
    """
    p.fetch_city_list()
    p_dict = dict()
    city_list = []

    p_dict["name"] = p.name

    for city in p.cityList:
        city: City = city
        c_dict = dict()
        # Robustness fix: a half-populated row (only the code cell or only
        # the name cell was scraped) no longer raises AttributeError.
        c_dict["name"] = getattr(city, "name", None)
        c_dict["no"] = getattr(city, "no", None)
        city_list.append(c_dict)
        make_city(city, c_dict)

    p_dict["cityList"] = city_list
    proList.append(p_dict)
38
def make_city(city: City, city_obj: dict):
    """Fetch city's counties and attach them as city_obj["countyList"].

    NOTE(review): Province.fetch_city_list already calls
    fetch_county_list for every city, so the call below re-fetches the
    same page. It is kept because fetch_county_list merges duplicates by
    link (the result is unchanged), but it doubles the network traffic —
    candidate for removal.
    """
    city.fetch_county_list()
    li = []
    county_list: list[County] = city.countyList
    for county in county_list:
        c_obj = dict()
        # Robustness fix: tolerate half-populated rows instead of raising
        # AttributeError on a missing name/no attribute.
        c_obj["name"] = getattr(county, "name", None)
        c_obj["no"] = getattr(county, "no", None)
        li.append(c_obj)

    city_obj["countyList"] = li
51
52
# Fetch every province's full tree, then serialize the document.
for province in provinceList:
    print("province = %s" % province.name)
    make_province(province)

di["provinceList"] = proList

# Bug fix: the original opened the file without ever closing it and used the
# platform default encoding. ensure_ascii=False keeps the Chinese names
# human-readable instead of \uXXXX escapes (and shrinks the output).
with open("data/city-version-%s.json" % version, 'w', encoding="utf-8") as f:
    f.write(json.dumps(di, ensure_ascii=False))

分了两个文件,其中一个是获取数据,一个是将数据转为 json 形式保存

如果后续有必要,也可以弄一个数据库,具体是 sqlite 还是 mysql 都可以自己解析 json 插入,对于一个合格的程序员都是小意思

代码

代码可以从仓库查看

生成数据

生成的数据比较大,大概有 22w 字符 200 多 K

可以从下载

或直接从 copy

格式化完的数据有 14000 行左右, 可以查看