Scraping a Weibo User's Followings with Python

Crawl the list of accounts a given user follows on Weibo (weibo.cn), up to 200 records (20 pages, 10 records per page).

# -*- coding: utf-8 -*-

import codecs
import os

import requests
from lxml import etree

# append one line of text to <username>/following.txt next to this script
def tosave(texta):
    fn = 'following'
    pa = os.path.join(os.path.dirname(os.path.abspath(__file__)), username)
    # create the per-user folder on first use
    if not os.path.exists(pa):
        os.mkdir(pa)
    fl = os.path.join(pa, '%s.txt' % fn)
    with codecs.open(fl, 'a', 'utf-8') as f:
        f.write(texta + '\n')

# fetch a page with the login cookie and parse it into an lxml tree
def getHtml(url, cook):
    html = requests.get(url, cookies=cook).content
    selector = etree.HTML(html)
    return selector


# run an XPath expression (copied from Chrome's inspector) against a parsed page
def getContent(htm, xpathStr):
    return htm.xpath(xpathStr)

# read the total page count from the pager's hidden input field
def getPageNum(htm):
    xps = '//*[@id="pagelist"]/form/div/input[1]/@value'
    pnum = getContent(htm, xps)
    print(pnum[0])
    return int(pnum[0])

cook = {"Cookie": "XXXXXX"}  # cookie of a logged-in weibo.cn session
username = 'XXXXX'  # id of the target user, as it appears in the profile URL
url0 = 'http://weibo.cn/%s/follow' % username
print(url0)
htm = getHtml(url0, cook)
pagenum = getPageNum(htm)
for pn in range(1, pagenum + 1):  # pages are 1-indexed, so include the last one
    urls = url0 + '?page=%s' % pn
    print(urls)
    htms = getHtml(urls, cook)
    xptxt = '//td[2]//a[1]'  # the page holds one small table per followed account
    xphref = '//td[2]//a[1]//@href'  # link to each followed account's profile
    consTxt = getContent(htms, xptxt)
    consHref = getContent(htms, xphref)
    for indx in range(1, len(consTxt)):
        followTxt = consTxt[indx].text + '     ' + consHref[indx]
        print(followTxt)
        tosave(followTxt)
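
Each saved line is the display name plus the first link from that account's table cell, separated by five spaces. As a minimal sketch of consuming the result (assuming the <username>/following.txt layout produced above; the path below is a hypothetical placeholder), the file can be read back into (name, link) pairs like this:

# -*- coding: utf-8 -*-
import codecs

path = 'XXXXX/following.txt'  # hypothetical path: the folder the scraper created

pairs = []
with codecs.open(path, 'r', 'utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # tosave() joined the name and the href with five spaces
        name, _, href = line.partition('     ')
        pairs.append((name, href))

print('%d followed accounts loaded' % len(pairs))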