使用python爬取一个省市城市列表
文章目录
前言
最近打算使用 flutter 做一个省市的级联列表 ,但是目前没有数据来源,就想着搜搜有没有 json 的数据,结果搜了一下只有各种数据库的
然后搜了下有个网站说有很完整的数据的还是收费的
作为一个穷人程序员,如果是公司用,我付费买一个也无所谓,但是本身是想私人使用,甚至开源出去,付费就没必要了
这时候我想着,既然如此,我就爬一份数据,自己造一个吧
说到爬虫,我们就想起了明年..两开花 python
开发环境
使用的语言是 python3
request_html+基本库
爬取网页
国家统计局-2017 年统计用区划代码和城乡划分代码(截止 2017 年 10 月 31 日)
查看网页
一级页面
使用 chrome dev 工具查看元素
呃. table 体系,最近几年算是比较少见了
分析了一下,整个页面只有备案号和省份名称是 a 标签,这下过滤一下备案号,剩下的不就是我们要的数据了吗
二级页面
点开北京,数据比较少,只有市辖区
内蒙的就比较多一点了
纯数字的是编码,其他的是名称,也是过滤掉 IPC 备案的就好
三级页面
和二级页面基本一致
撸码
city_get.py
1import json
2from requests_html import HTMLSession
3import requests_html
4
5session = HTMLSession()
6
7
8class Entity:
9 name: str
10 link: str
11 no: str
12 baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/"
13
14 def __str__(self) -> str:
15 return "name:%s,link=%s" % (self.name, self.link)
16
17 def __eq__(self, o: object) -> bool:
18 return self.link == o.link
19
20 def to_json(self) -> str:
21 pass
22
23
24class Province(Entity):
25
26 def __init__(self) -> None:
27 super().__init__()
28 self.cityList = []
29
30 def __str__(self) -> str:
31 return "name:%s,link=%s" % (self.name, self.link)
32
33 def fetch_city_list(self):
34 url = "%s%s" % (Entity.baseUrl, self.link)
35 r = session.get(url)
36 r.encoding = "gbk"
37 h: requests_html.HTML = r.html
38 li: list[requests_html.Element] = h.find("a")
39 for a in li:
40 text = a.text
41 if text.__contains__("京ICP"):
42 continue
43 href_ = a.attrs["href"]
44 city = City()
45 city.link = href_
46 city.province = self
47 # print(text, href_)
48
49 try:
50 index = self.cityList.index(city)
51 city = self.cityList[index]
52 except ValueError:
53 self.cityList.append(city)
54
55 if text.isnumeric():
56 city.no = text
57 else:
58 city.name = text
59
60 for city in self.cityList:
61 city.fetch_county_list()
62
63 def to_json(self) -> str:
64 pass
65
66
67class City(Entity):
68 province: Province
69
70 def __init__(self) -> None:
71 super().__init__()
72 self.countyList = []
73
74 def fetch_county_list(self):
75 print("%s 开始" % self.name)
76 url = "%s%s" % (Entity.baseUrl, self.link)
77 r = session.get(url)
78 r.encoding = "gbk"
79 h: requests_html.HTML = r.html
80 li: list[requests_html.Element] = h.find("a")
81 for a in li:
82 text = a.text
83 if text.__contains__("京ICP"):
84 continue
85 href_ = a.attrs["href"]
86 county = County()
87 county.link = href_
88 county.province = self
89 # print(text, href_)
90
91 try:
92 index = self.countyList.index(county)
93 county = self.countyList[index]
94 except ValueError:
95 self.countyList.append(county)
96
97 if text.isnumeric():
98 county.no = text
99 else:
100 county.name = text
101
102 for county in self.countyList:
103 # print(county.__str__())
104 pass
105
106 print("%s 结束" % self.name)
107
108 pass
109
110
111class County(Entity):
112 city: City
113 pass
114
115
116provinceList = []
117
118
119def fetch_province_list():
120 response = session.get("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html")
121 response.encoding = "gbk"
122 html: requests_html.HTML = response.html
123 # s = response.content.decode("gbk")
124 l: list = html.find("a")
125 for a in l:
126 ae: requests_html.Element = a
127 href: str = ae.attrs.get("href")
128 if href.endswith("html"):
129 province = Province()
130 province.name = ae.text.lstrip()
131 province.link = href.lstrip()
132 provinceList.append(province)
133
134
135fetch_province_list()
136
137if __name__ == '__main__':
138 for p in provinceList:
139 if p.name == "黑龙江省":
140 p.fetch_city_list()
141# session.close()
dump_data.py
1import json
2
3from city.city_get import Province, County, City, provinceList
4import datetime
5
6# for p in provinceList:
7# pr: Province = p
8version = 2
9
10di = dict()
11di["version"] = version
12now = datetime.datetime.now()
13date = datetime.datetime.strftime(now, "%Y-%m-%d %H:%M:%S")
14di["date"] = date
15di["timeStamp"] = now.timestamp()
16
17proList = []
18
19
20def make_province(p: Province):
21 p.fetch_city_list()
22 p_dict = dict()
23 city_list = []
24
25 p_dict["name"] = p.name
26
27 for city in p.cityList:
28 city: City = city
29 c_dict = dict()
30 c_dict["name"] = city.name
31 c_dict["no"] = city.no
32 city_list.append(c_dict)
33 make_city(city, c_dict)
34
35 p_dict["cityList"] = city_list
36 proList.append(p_dict)
37
38
39def make_city(city: City, city_obj: dict):
40 city.fetch_county_list()
41 li = []
42 county_list: list[County] = city.countyList
43 for county in county_list:
44 c_obj = dict()
45 c_obj["name"] = county.name
46 c_obj["no"] = county.no
47 li.append(c_obj)
48
49 city_obj["countyList"] = li
50 pass
51
52
53for province in provinceList:
54 print("province = %s" % province.name)
55 make_province(province)
56
57di["provinceList"] = proList
58
59s = json.dumps(di)
60
61f = open("data/city-version-%s.json" % version, 'w')
62
63f.write(s)
分了两个文件,其中一个是获取数据,一个是将数据转为 json 形式保存
如果后续有必要,也可以弄一个数据库,具体是 sqlite 还是 mysql 都可以自己解析 json 插入,对于一个合格的程序员都是小意思
代码
代码可以从 github 仓库查看
生成数据
生成的数据比较大,大概有 22w 字符 200 多 K
可以从 github release 下载
或直接从 city-version-4.json copy
格式化完的数据有 14000 行左右, 可以查看 pretty-json