python

说下背景：起因是公司业务上有一块功能出现了问题，用户的设备端在请求一个 xml 文件的时候，发现从服务器下载总是出错，或者下载超级慢。因为现有的环境是国外客户的设备全部都连接到了国内的阿里云服务器，导致下载异常地慢。所以现在想让国外的客户在下载文件的时候，先判断设备端所在的位置：如果设备端在国外，就重定向到新加坡的阿里云 OSS 下载；否则（国内 IP 地址的用户）就重定向到国内的阿里云 OSS 地址下载。

1. 首先，设备端是通过http请求来下载文件的，所以我唯一可以知道的是设备端连接过来的remoteaddr 地址.
2. 这个时候我就可以通过 remoteaddr 地址去判断用户的设备到底处于国内还是国外，因为用户的设备有可能是移动的
3. 这个时候就找到了一个淘宝的IP地址库查询接口：http://ip.taobao.com/ipSearch.php
4. 就根据这个接口去请求查询IP地址的位置，然后做相应的地址重定向
5. 存在的问题：淘宝IP地址库的查询请求是有频率限制的，所以频繁查询时会出现查询失败的情况，这个时候默认跳转到新加坡，因为我们主要的客户在国外

好了，以下说正事~

淘宝的IP库，看起来就是给出了比较详细的信息：

{
    "code": 0,
    "data": {
        "country": "中国",
        "country_id": "CN",
        "area": "华南",
        "area_id": "800000",
        "region": "广东省",
        "region_id": "440000",
        "city": "广州市",
        "city_id": "440100",
        "county": "",
        "county_id": "-1",
        "isp": "电信",
        "isp_id": "100017",
        "ip": "14.215.177.38"
    }
}

{

"code": 0,

"data": {

"country": "中国",

"country_id": "CN",

"area": "华南",

"area_id": "800000",

"region": "广东省",

"region_id": "440000",

"city": "广州市",

"city_id": "440100",

"county": "",

"county_id": "-1",

"isp": "电信",

"isp_id": "100017",

"ip": "14.215.177.38"

}

}

这个时候，我就想做一个自己的地址库，然后让别人来查。看了下上面有的信息，又上网查了下，好像可行，就开始动手了。
需要的信息：
1. 国家，国家代码
2. 省，省代码
3. 市，市代码
4. isp
5. IP地址库
相应的地址，在代码里面有，需要的可以看代码的请求地址

去网上搜索了一下，好像这些大概都可以找到，接下来去爬下来就好了（无奈IP地址库的信息，现在我只找到了省级以上的地址库信息，最后也没找全）。

国家代码是在维基百科上爬的

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import urllib2
import time
import re
from bs4 import BeautifulSoup


class HtmlDownloader(object):
    """Fetch page content over HTTP (with retries) or from a local file."""

    # Headers sent with every request.  The User-Agent is assembled with
    # implicit string concatenation so that it contains neither the
    # indentation of the continuation lines nor a stray backslash.
    header = {
        'Cookie': 'AD_RS_COOKIE=20083363',
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/58.0.3029.110 Safari/537.36'),
    }

    def download(self, url, max_retries=None, retry_delay=5):
        """Return the body of *url*, retrying until a 200 response arrives.

        Args:
            url: Address to fetch; must not be None.
            max_retries: Maximum number of retries after the first attempt.
                ``None`` (the default) retries forever, as the original
                implementation did.
            retry_delay: Seconds to sleep between attempts.

        Returns:
            The response body as returned by ``resp.read()``.

        Raises:
            Exception: If *url* is None, or if *max_retries* is set and
                every attempt failed.
        """
        if url is None:
            raise Exception('url is None')
        request = urllib2.Request(url, None, HtmlDownloader.header)
        failures = 0
        # Retry in a loop rather than by recursion so that a long outage
        # cannot exhaust the interpreter's recursion limit.
        while True:
            try:
                resp = urllib2.urlopen(request)
                try:
                    if resp.getcode() == 200:
                        return resp.read()
                finally:
                    # Always release the connection, even on a non-200 reply.
                    resp.close()
            except urllib2.URLError as e:
                print(e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise Exception('download failed: ' + url)
            time.sleep(retry_delay)

    def readhtml(self, filename):
        """Return the full content of the local file *filename*."""
        with open(filename) as file_object:
            return file_object.read()



class HtmlParser(object):
    """Parse crawled HTML pages and print the extracted records to stdout."""

    # Downloader used by the *_parse_text crawl methods.  When left as None
    # they fall back to the module-level ``html_downloader`` (set by the
    # script entry point) and finally to a fresh ``HtmlDownloader``.
    downloader = None

    def __init__(self, downloader=None):
        """Create a parser, optionally with an explicit *downloader*.

        Args:
            downloader: Object exposing ``download(url)``; optional.
        """
        self.downloader = downloader

    def has_tag(self, tag):
        """Return whether *tag* has an attribute named ``span``."""
        return tag.has_attr('span')

    def region_parser(self, html_content):
        """Print ``<code>--><name>`` for every ``MsoNormal`` element.

        The element text is split on single spaces; the first two pieces are
        used.  Elements with fewer than two pieces are skipped instead of
        raising IndexError.

        Raises:
            Exception: If *html_content* is None.
        """
        if html_content is None:
            raise Exception('html is None')
        soup = BeautifulSoup(html_content, 'html.parser')
        for tag in soup.find_all(class_="MsoNormal"):
            parts = tag.get_text().split(" ")
            if len(parts) < 2:
                continue
            print(parts[0].strip() + "-->" + parts[1].strip())

    def contry_parse(self, html_content):
        """Print the 1st and 5th cell of every five ``td`` cells per table.

        Only elements with class ``wikitable sortable`` are scanned; cell
        counting restarts for each of them.

        Raises:
            Exception: If *html_content* is None.
        """
        if html_content is None:
            raise Exception('html is None')
        soup = BeautifulSoup(html_content, 'html.parser')
        for table in soup.find_all(class_="wikitable sortable"):
            for index, cell in enumerate(table.select('td')):
                column = index % 5
                if column == 0:
                    print("id-->" + cell.get_text().strip())
                elif column == 4:
                    print("name-->" + cell.get_text().strip())

    def contry_ipaddrlink_parse(self, html_content):
        """Print ``<link text>--><href>`` for every ipblock.chacuo.net view link.

        Raises:
            Exception: If *html_content* is None.
        """
        if html_content is None:
            raise Exception('html is None')
        soup = BeautifulSoup(html_content, 'html.parser')
        for tag in soup.find_all(href=re.compile(u'http://ipblock.chacuo.net/view/.*')):
            print(tag.get_text() + "-->" + tag.get('href'))

    def ipaddress_parse(self, html_content):
        """Return the text of the first ``<pre>`` element, or None if absent.

        Raises:
            Exception: If *html_content* is None.
        """
        if html_content is None:
            raise Exception('html is None')
        soup = BeautifulSoup(html_content, 'html.parser')
        pre = soup.find('pre')
        if pre is None:
            return None
        return pre.get_text()

    @staticmethod
    def _format_ip_rows(text, labels):
        """Format the tab-separated records found in *text*.

        *text* is split on ``'\\r\\n'`` and each record on tabs.  A record
        with at least ``len(labels)`` fields yields one line made of each
        label followed by the field at the same position; shorter records
        are ignored.

        Returns:
            A list with one formatted string per accepted record.
        """
        rows = []
        for record in text.split('\r\n'):
            fields = record.split('\t')
            if len(fields) < len(labels):
                continue
            pieces = []
            for label, field in zip(labels, fields):
                pieces.append(label)
                pieces.append(field)
            rows.append(''.join(pieces))
        return rows

    def _get_downloader(self):
        """Return the downloader the crawl methods should use."""
        if self.downloader is not None:
            return self.downloader
        # Backward compatibility: the script entry point publishes its
        # downloader as the module-level name ``html_downloader``.
        shared = globals().get('html_downloader')
        if shared is not None:
            return shared
        return HtmlDownloader()

    def _crawl_ip_lists(self, html_content, link_pattern, view_prefix,
                        down_prefix, labels):
        """Download and print the list behind every link matching *link_pattern*.

        For each matching link the href is rewritten (*view_prefix* replaced
        by *down_prefix*), downloaded, and the records inside its ``<pre>``
        element are printed with *labels*.  ``no data`` is printed when the
        page has no ``<pre>`` element or when handling a link fails.
        """
        if html_content is None:
            raise Exception('html is None')
        soup = BeautifulSoup(html_content, 'html.parser')
        downloader = self._get_downloader()
        for tag in soup.find_all(href=re.compile(link_pattern)):
            content = downloader.download(
                re.sub(view_prefix, down_prefix, tag.get('href')))
            # Best effort per link: a failure while printing or parsing one
            # list must not abort the whole crawl.
            try:
                print(tag.get_text())
                text = self.ipaddress_parse(content)
                if text is None:
                    print("no data")
                    continue
                for row in self._format_ip_rows(text, labels):
                    print(row)
            except Exception:
                print("no data")

    def ipaddress_parse_text(self, html_content):
        """Crawl every ipblock.chacuo.net view link in *html_content*.

        Prints four labelled fields per record.

        Raises:
            Exception: If *html_content* is None.
        """
        self._crawl_ip_lists(
            html_content,
            u'http://ipblock.chacuo.net/view/.*',
            r'view/c_', "down/t_txt=c_",
            ('--->ip:', '--->mask:', '-->mask/len:', '-->num:'))

    def s_ipaddress_parse_text(self, html_content):
        """Crawl every ips.chacuo.net view link in *html_content*.

        Prints three labelled fields per record.

        Raises:
            Exception: If *html_content* is None.
        """
        self._crawl_ip_lists(
            html_content,
            u'http://ips.chacuo.net/view/.*',
            r'view/s_', "down/t_txt=p_",
            ('--->ip:', '--->mask:', '-->num:'))

    def isp_ipaddress_parse_text(self, html_content):
        """Crawl every ipcn.chacuo.net view link in *html_content*.

        Prints three labelled fields per record.

        Raises:
            Exception: If *html_content* is None.
        """
        self._crawl_ip_lists(
            html_content,
            u'http://ipcn.chacuo.net/view/.*',
            r'view/i_', "down/t_txt=c_",
            ('--->ip:', '--->mask:', '-->num:'))

if __name__ == '__main__':
    # Must stay a module-level name: HtmlParser's *_parse_text crawl methods
    # resolve ``html_downloader`` from module scope.
    html_downloader = HtmlDownloader()

    # Other crawls used while collecting the data (call them by hand):
    #   region codes:        download('http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html') -> region_parser
    #   country codes:       readhtml('ISO3166-1.html') -> contry_parse
    #   country link index:  download('http://ipblock.chacuo.net') -> contry_ipaddrlink_parse
    #   one country's list:  download('http://ipblock.chacuo.net/down/t_txt=c_AO') -> ipaddress_parse
    #   all country lists:   download('http://ipblock.chacuo.net') -> ipaddress_parse_text

    # (index page, HtmlParser method) pairs, run in order.
    crawl_jobs = (
        ('http://ips.chacuo.net/', 's_ipaddress_parse_text'),
        ('http://ipcn.chacuo.net/', 'isp_ipaddress_parse_text'),
    )
    for index_url, method_name in crawl_jobs:
        html_content = html_downloader.download(index_url)
        html_parser = HtmlParser()
        getattr(html_parser, method_name)(html_content)

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

#!/usr/bin/env python

# -*- coding:utf-8 -*-

import urllib2

import time

import re

from bs4 import BeautifulSoup

class HtmlDownloader(object):
    """Fetch page content over HTTP (with retries) or from a local file."""

    # Headers sent with every request.  The User-Agent is assembled with
    # implicit string concatenation so that it contains neither the
    # indentation of the continuation lines nor a stray backslash.
    header = {
        'Cookie': 'AD_RS_COOKIE=20083363',
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/58.0.3029.110 Safari/537.36'),
    }

    def download(self, url, max_retries=None, retry_delay=5):
        """Return the body of *url*, retrying until a 200 response arrives.

        Args:
            url: Address to fetch; must not be None.
            max_retries: Maximum number of retries after the first attempt.
                ``None`` (the default) retries forever, as the original
                implementation did.
            retry_delay: Seconds to sleep between attempts.

        Returns:
            The response body as returned by ``resp.read()``.

        Raises:
            Exception: If *url* is None, or if *max_retries* is set and
                every attempt failed.
        """
        if url is None:
            raise Exception('url is None')
        request = urllib2.Request(url, None, HtmlDownloader.header)
        failures = 0
        # Retry in a loop rather than by recursion so that a long outage
        # cannot exhaust the interpreter's recursion limit.
        while True:
            try:
                resp = urllib2.urlopen(request)
                try:
                    if resp.getcode() == 200:
                        return resp.read()
                finally:
                    # Always release the connection, even on a non-200 reply.
                    resp.close()
            except urllib2.URLError as e:
                print(e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise Exception('download failed: ' + url)
            time.sleep(retry_delay)

    def readhtml(self, filename):
        """Return the full content of the local file *filename*."""
        with open(filename) as file_object:
            return file_object.read()

class HtmlParser(object):
    """Parse crawled HTML pages and print the extracted records to stdout."""

    # Downloader used by the *_parse_text crawl methods.  When left as None
    # they fall back to the module-level ``html_downloader`` (set by the
    # script entry point) and finally to a fresh ``HtmlDownloader``.
    downloader = None

    def __init__(self, downloader=None):
        """Create a parser, optionally with an explicit *downloader*.

        Args:
            downloader: Object exposing ``download(url)``; optional.
        """
        self.downloader = downloader

    def has_tag(self, tag):
        """Return whether *tag* has an attribute named ``span``."""
        return tag.has_attr('span')

    def region_parser(self, html_content):
        """Print ``<code>--><name>`` for every ``MsoNormal`` element.

        The element text is split on single spaces; the first two pieces are
        used.  Elements with fewer than two pieces are skipped instead of
        raising IndexError.

        Raises:
            Exception: If *html_content* is None.
        """
        if html_content is None:
            raise Exception('html is None')
        soup = BeautifulSoup(html_content, 'html.parser')
        for tag in soup.find_all(class_="MsoNormal"):
            parts = tag.get_text().split(" ")
            if len(parts) < 2:
                continue
            print(parts[0].strip() + "-->" + parts[1].strip())

    def contry_parse(self, html_content):
        """Print the 1st and 5th cell of every five ``td`` cells per table.

        Only elements with class ``wikitable sortable`` are scanned; cell
        counting restarts for each of them.

        Raises:
            Exception: If *html_content* is None.
        """
        if html_content is None:
            raise Exception('html is None')
        soup = BeautifulSoup(html_content, 'html.parser')
        for table in soup.find_all(class_="wikitable sortable"):
            for index, cell in enumerate(table.select('td')):
                column = index % 5
                if column == 0:
                    print("id-->" + cell.get_text().strip())
                elif column == 4:
                    print("name-->" + cell.get_text().strip())

    def contry_ipaddrlink_parse(self, html_content):
        """Print ``<link text>--><href>`` for every ipblock.chacuo.net view link.

        Raises:
            Exception: If *html_content* is None.
        """
        if html_content is None:
            raise Exception('html is None')
        soup = BeautifulSoup(html_content, 'html.parser')
        for tag in soup.find_all(href=re.compile(u'http://ipblock.chacuo.net/view/.*')):
            print(tag.get_text() + "-->" + tag.get('href'))

    def ipaddress_parse(self, html_content):
        """Return the text of the first ``<pre>`` element, or None if absent.

        Raises:
            Exception: If *html_content* is None.
        """
        if html_content is None:
            raise Exception('html is None')
        soup = BeautifulSoup(html_content, 'html.parser')
        pre = soup.find('pre')
        if pre is None:
            return None
        return pre.get_text()

    @staticmethod
    def _format_ip_rows(text, labels):
        """Format the tab-separated records found in *text*.

        *text* is split on ``'\\r\\n'`` and each record on tabs.  A record
        with at least ``len(labels)`` fields yields one line made of each
        label followed by the field at the same position; shorter records
        are ignored.

        Returns:
            A list with one formatted string per accepted record.
        """
        rows = []
        for record in text.split('\r\n'):
            fields = record.split('\t')
            if len(fields) < len(labels):
                continue
            pieces = []
            for label, field in zip(labels, fields):
                pieces.append(label)
                pieces.append(field)
            rows.append(''.join(pieces))
        return rows

    def _get_downloader(self):
        """Return the downloader the crawl methods should use."""
        if self.downloader is not None:
            return self.downloader
        # Backward compatibility: the script entry point publishes its
        # downloader as the module-level name ``html_downloader``.
        shared = globals().get('html_downloader')
        if shared is not None:
            return shared
        return HtmlDownloader()

    def _crawl_ip_lists(self, html_content, link_pattern, view_prefix,
                        down_prefix, labels):
        """Download and print the list behind every link matching *link_pattern*.

        For each matching link the href is rewritten (*view_prefix* replaced
        by *down_prefix*), downloaded, and the records inside its ``<pre>``
        element are printed with *labels*.  ``no data`` is printed when the
        page has no ``<pre>`` element or when handling a link fails.
        """
        if html_content is None:
            raise Exception('html is None')
        soup = BeautifulSoup(html_content, 'html.parser')
        downloader = self._get_downloader()
        for tag in soup.find_all(href=re.compile(link_pattern)):
            content = downloader.download(
                re.sub(view_prefix, down_prefix, tag.get('href')))
            # Best effort per link: a failure while printing or parsing one
            # list must not abort the whole crawl.
            try:
                print(tag.get_text())
                text = self.ipaddress_parse(content)
                if text is None:
                    print("no data")
                    continue
                for row in self._format_ip_rows(text, labels):
                    print(row)
            except Exception:
                print("no data")

    def ipaddress_parse_text(self, html_content):
        """Crawl every ipblock.chacuo.net view link in *html_content*.

        Prints four labelled fields per record.

        Raises:
            Exception: If *html_content* is None.
        """
        self._crawl_ip_lists(
            html_content,
            u'http://ipblock.chacuo.net/view/.*',
            r'view/c_', "down/t_txt=c_",
            ('--->ip:', '--->mask:', '-->mask/len:', '-->num:'))

    def s_ipaddress_parse_text(self, html_content):
        """Crawl every ips.chacuo.net view link in *html_content*.

        Prints three labelled fields per record.

        Raises:
            Exception: If *html_content* is None.
        """
        self._crawl_ip_lists(
            html_content,
            u'http://ips.chacuo.net/view/.*',
            r'view/s_', "down/t_txt=p_",
            ('--->ip:', '--->mask:', '-->num:'))

    def isp_ipaddress_parse_text(self, html_content):
        """Crawl every ipcn.chacuo.net view link in *html_content*.

        Prints three labelled fields per record.

        Raises:
            Exception: If *html_content* is None.
        """
        self._crawl_ip_lists(
            html_content,
            u'http://ipcn.chacuo.net/view/.*',
            r'view/i_', "down/t_txt=c_",
            ('--->ip:', '--->mask:', '-->num:'))

if __name__ == '__main__':
    # Must stay a module-level name: HtmlParser's *_parse_text crawl methods
    # resolve ``html_downloader`` from module scope.
    html_downloader = HtmlDownloader()

    # Other crawls used while collecting the data (call them by hand):
    #   region codes:        download('http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html') -> region_parser
    #   country codes:       readhtml('ISO3166-1.html') -> contry_parse
    #   country link index:  download('http://ipblock.chacuo.net') -> contry_ipaddrlink_parse
    #   one country's list:  download('http://ipblock.chacuo.net/down/t_txt=c_AO') -> ipaddress_parse
    #   all country lists:   download('http://ipblock.chacuo.net') -> ipaddress_parse_text

    # (index page, HtmlParser method) pairs, run in order.
    crawl_jobs = (
        ('http://ips.chacuo.net/', 's_ipaddress_parse_text'),
        ('http://ipcn.chacuo.net/', 'isp_ipaddress_parse_text'),
    )
    for index_url, method_name in crawl_jobs:
        html_content = html_downloader.download(index_url)
        html_parser = HtmlParser()
        getattr(html_parser, method_name)(html_content)

个人经验要点：
1. 爬基本信息的时候，如果遇到整页信息的，其实可以不用 http 请求，特别是像国外网站（维基百科），不科学上网还请求不下来，这个时候直接手动复制一下，然后读入解析就好了。我爬维基百科就是直接把页面复制成文件，然后解析文件的

2. 遇到二级或者三级页面的时候，可以自己手动点击一下，然后看看页面的跳转，因为大批量类似页面的时候，有时候可能只需要改变页面的一个字符就可以直接请求了

3. 关键点在解析部分，这里我用的是python + BeautifulSoup 爬的，之前我想用go爬，却发现做正则表达式匹配的时候非常困难，然后爬了一个就改为用python了
BeautifulSoup 好像可以直接过滤掉&nbsp这类的字符，然后有很多的接口可以直接调用，获取到title 之类的html 标签，很方便

4. 当爬到纯文本的时候，这个时候要读取行或者列的时候，用字符串的分隔，分成数组，来挑选其中需要的项，我觉得这样是比较方便的。

5. 注意请求时要加上一些基本的 http 请求头信息，否则有的网站会识别出爬虫，然后不会回应你。

6. 封装请求html 下载页面内容的方法，再解析想要的内容，存入数据库即可。

7. 服务部分就可以直接写服务，读取相应的数据库，查询，提供服务即可。

^画※哲^

互联网让世界没有陌生人，只有还没认识的小伙伴～

简单的爬虫实验

一	二	三	四	五	六	日
« 1月
			1	2	3	4
5	6	7	8	9	10	11
12	13	14	15	16	17	18
19	20	21	22	23	24	25
26	27	28	29	30	31