Python网络爬虫实战之十:利用API进行数据采集

目录:Python网络爬虫实战系列

正文:

一、什么是API?

API(Application Programming Interface,应用程序编程接口)是一些预先定义的函数,目的是让应用程序和开发人员能够基于某软件或硬件访问一组例程,而又无需访问源码或理解其内部工作机制的细节。

例如:
http://apis.juhe.cn/ip/ip2addr?ip=112.112.11.11&key=appkey

返回的json格式的数据是:

{
   "resultcode":"200",
   "reason":"Return Successd!",
   "result":{
      "area":"江苏省苏州市",
      "location":"电信"
   }
}

返回的xml格式的数据是:

<?xml version="1.0" encoding="utf-8" ?> 
<root>
    <resultcode>200</resultcode> 
    <reason>Return Successd!</reason> 
    <result>
        <area>江苏省苏州市</area> 
        <location>电信</location> 
    </result>
</root>

二、使用Python调用API

1、使用python解析json格式的数据

import json

jsonString = '{"arrayOfNums":[{"number":0},{"number":1},{"number":2}],"arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}'

# Decode the JSON text into nested Python dicts and lists.
jsonObj = json.loads(jsonString)

# Pull each array out once so the lookups below stay short.
nums = jsonObj["arrayOfNums"]
fruits = jsonObj["arrayOfFruits"]

print(nums)  # the whole list of number objects
print(nums[1])  # the second number object
print(nums[1]["number"] + nums[2]["number"])  # sum of the 2nd and 3rd numbers
print(fruits[2]["fruit"])  # name of the third fruit

2、使用python 2.x 调用聚合数据中的天气预报API

from urllib import urlencode
import urllib
import json

# Set this to the APPKey issued for your own application
appkey = "XXXXXXXXXXXXXXXXXXXXXXXX"


# Query the weather for a city
def queryWeather(appkey, m="GET", city="广州", dtype="json"):
    """Fetch weather data for ``city`` from the Juhe weather endpoint.

    Args:
        appkey: APPKEY of the application, sent as the ``key`` parameter.
        m: Request mode. ``"GET"`` appends the parameters to the URL as a
            query string; any other value passes them to ``urlopen`` as the
            request data instead.
        city: City name to look up, sent as ``cityname``.
        dtype: Requested response format, sent as ``dtype``. The body is
            always parsed with ``json.loads``, so only ``"json"`` is usable.

    Returns:
        The ``result`` member of the decoded response when its
        ``error_code`` is 0. Otherwise a diagnostic line is printed and
        ``None`` is returned.
    """
    url = "http://v.juhe.cn/weather/index"
    params = urlencode({
        "cityname": city,  # city to query, e.g. 温州, 上海, 北京
        "key": appkey,  # application APPKEY (see the application detail page)
        "dtype": dtype,  # response format, xml or json; json by default
    })
    if m == "GET":
        f = urllib.urlopen("%s?%s" % (url, params))
    else:
        f = urllib.urlopen(url, params)

    # Release the connection even if reading the body fails.
    try:
        content = f.read()
    finally:
        f.close()

    res = json.loads(content)
    if not res:
        print("request api error")
        return None

    error_code = res["error_code"]
    if error_code == 0:
        # Request succeeded
        return res["result"]

    # A parenthesised single-argument print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print("%s:%s" % (res["error_code"], res["reason"]))
    return None


weather = queryWeather(appkey, "GET")
print(weather)
# queryWeather returns None when the request fails, so only read fields
# from the payload when there is one.
if weather:
    print(urllib.unquote(weather.get("sk").get("wind_direction")))

3、使用python 2.x 调用聚合数据中的查询IP地址API

from urllib import urlopen
import json


def getCountry(ipAddress, appkey):
    """Look up the area for an IP address via the Juhe ip2addr endpoint.

    Args:
        ipAddress: IP address to query, sent as the ``ip`` parameter.
        appkey: APPKEY of the application, sent as the ``key`` parameter.

    Returns:
        The ``area`` value from the response's ``result`` object, or
        ``None`` when the response has no ``result`` or no ``area``.
    """
    handle = urlopen("http://apis.juhe.cn/ip/ip2addr?ip=" + ipAddress + "&key=" + appkey)
    # Release the connection even if reading or decoding the body fails.
    try:
        response = handle.read().decode('utf-8')
    finally:
        handle.close()

    responseJson = json.loads(response)
    # "area" is nested inside the "result" object of this API's payload, not
    # at the top level; fall back to an empty dict when "result" is missing
    # or null so the lookup yields None instead of raising.
    result = responseJson.get("result") or {}
    return result.get("area")


# Set this to the APPKey issued for your own application. The value below is
# only a placeholder: a real key should never be published in source code.
appkey = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
print(getCountry("61.135.169.121", appkey))

4、使用python 2.x 调用微博API

微博的Python 2.x SDK:
http://github.liaoxuefeng.com/sinaweibopy/
https://github.com/michaelliao/sinaweibopy

安装sdk

pip install sinaweibopy

实例代码

from weibo import APIClient
import webbrowser

## 1. Account information of your own weibo application
APP_KEY = 'XXXXXX'
APP_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXX'
CALLBACK_URL = 'http://f.dataguru.cn'

## 2. Request authorization
# 2.1 Build the client and the URL of the authorization page
client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL)
url = client.get_authorize_url()
# print(url)
# https://api.weibo.com/oauth2/authorize?redirect_uri=http%3A//f.dataguru.cn&response_type=code&client_id=2337575664
# 2.2
# Open the authorization page; after you click to approve, the browser is
# redirected to the callback page configured above (i.e. CALLBACK_URL).
# Copy the code from the address bar of the callback page (it changes on every
# authorization); it is used to call the oauth2/access_token endpoint and
# obtain the access token.
webbrowser.open_new(url)
# http://f.dataguru.cn/?code=6240f86a9c757ef6ea985cd28647f05a
# NOTE(review): the literal below is the code from one past authorization;
# replace it with the code from your own callback URL.
code = '6240f86a9c757ef6ea985cd28647f05a'

## 3. Obtain authorization
# Get the token and the token's lifetime
r = client.request_access_token(code)
# print(r)
access_token = r.access_token
# print(access_token)
expires_in = r.expires_in
# print(expires_in)

## 4. Set the token for subsequent API requests
client.set_access_token(access_token, expires_in)

## 5. Fetch the latest posts of the logged-in user and of the (authorized) users they follow: statuses/home_timeline
# https://api.weibo.com/2/statuses/home_timeline.json
statuses = client.statuses.home_timeline.get(count=10)['statuses']
# print(statuses[1])
length = len(statuses)
print(length)
# Print a few fields of every post
for status in statuses:
    author = status['user']
    print(u'昵称:' + author['screen_name'])
    print(u'简介:' + author['description'])
    print(u'位置:' + author['location'])
    print(u'微博:' + status['text'])

## 6. Fetch the latest posts that mention the logged-in user (the "@me" list): statuses/mentions
# https://api.weibo.com/2/statuses/mentions.json
statuses = client.statuses.mentions.get()['statuses']
# print(statuses[1])
length = len(statuses)
print(length)
# Print a few fields of every post
for status in statuses:
    author = status['user']
    print(u'昵称:' + author['screen_name'])
    print(u'简介:' + author['description'])
    print(u'位置:' + author['location'])
    print(u'微博:' + status['text'])
    print(u'时间:' + status['created_at'])

5、使用python 3.x 调用微博API

微博的Python 3.x SDK:
https://github.com/nooperpudd/weibopy

安装sdk

pip install weibopy

实例代码

from weibopy import WeiboOauth2
import webbrowser

## 1. Account information of your own weibo application
APP_KEY = 'XXXXXX'
APP_SECRET = 'XXXXXXXXXXXX'
CALLBACK_URL = 'http://f.dataguru.cn'

## 2. Request authorization
# 2.1 Build the OAuth2 client and read the URL of the authorization page
client = WeiboOauth2(APP_KEY, APP_SECRET, CALLBACK_URL)
authorize_url = client.authorize_url
print(authorize_url)
# https://api.weibo.com/oauth2/authorize?redirect_uri=http%3A//f.dataguru.cn&response_type=code&client_id=2337575664
# 2.2
# Open the authorization page; after you click to approve, the browser is
# redirected to the callback page configured above (i.e. CALLBACK_URL).
# Copy the code from the address bar of the callback page (it changes on every
# authorization); it is used to call the oauth2/access_token endpoint and
# obtain the access token.
webbrowser.open_new(authorize_url)
# http://f.dataguru.cn/?code=4b156593e9dfdd16279bbcc9eb7817bf
# NOTE(review): the literal below is the code from one past authorization;
# replace it with the code from your own callback URL.
code = '4b156593e9dfdd16279bbcc9eb7817bf'

## 3. Obtain authorization
# Exchange the code for the access token and the token's lifetime
r = client.auth_access(code)
# print(r)
access_token = r.get("access_token")
# print(access_token)
# `r` is read as a mapping for access_token, so read expires_in the same way:
# attribute access (r.expires_in) raises AttributeError on a dict.
expires_in = r.get("expires_in")
# print(expires_in)

## 4. Create the client that carries the token for subsequent API requests
from weibopy import WeiboClient

client = WeiboClient(access_token)

## 5. Fetch the latest posts of the logged-in user and of the (authorized) users they follow: statuses/home_timeline
# https://api.weibo.com/2/statuses/home_timeline.json
result = client.get(suffix="statuses/home_timeline.json")
statuses = result.get("statuses")
# print(statuses[0])
length = len(statuses)
print(length)
# Print a few fields of every post
for status in statuses:
    author = status['user']
    print(u'昵称:' + author['screen_name'])
    print(u'简介:' + author['description'])
    print(u'位置:' + author['location'])
    print(u'微博:' + status['text'])

## 6. Fetch the latest posts that mention the logged-in user (the "@me" list): statuses/mentions
# https://api.weibo.com/2/statuses/mentions.json
result = client.get(suffix="statuses/mentions.json")
statuses = result.get("statuses")
# print(statuses[0])
length = len(statuses)
print(length)
# Print a few fields of every post
for status in statuses:
    author = status['user']
    print(u'昵称:' + author['screen_name'])
    print(u'简介:' + author['description'])
    print(u'位置:' + author['location'])
    print(u'微博:' + status['text'])
    print(u'时间:' + status['created_at'])

作者:麦典威
链接:https://www.jianshu.com/p/b4cea92c04a2
来源:简书
著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。