Crawl the list of accounts that a specified weibo.cn user follows, up to 200 records (20 pages, 10 records per page).
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import os
import codecs
# append one line of text to <username>/following.txt
def tosave(texta):
    fn = 'following'
    pa = os.path.join(os.path.dirname(os.path.abspath(__file__)), username)
    # create the output folder on first use
    if not os.path.exists(pa):
        os.mkdir(pa)
    fl = os.path.join(pa, '%s.txt' % fn)
    with codecs.open(fl, 'a', 'utf-8') as f:
        f.write(texta + '\n')
# fetch a page with the login cookie and parse it into an lxml element tree
def getHtml(url, cook):
    html = requests.get(url, cookies=cook).content
    selector = etree.HTML(html)
    return selector
# run an XPath query against a parsed page and return the matching nodes
def getContent(htm, xpathStr):
    selector = htm
    content = selector.xpath(xpathStr)  # the XPath can be copied from Chrome's inspector
    return content
# read the total page count from the pager at the bottom of the first page
def getPageNum(htm):
    xps = '//*[@id="pagelist"]/form/div/input[1]/@value'
    pnum = getContent(htm, xps)
    if not pnum:  # no pager element means there is only a single page
        return 1
    print(pnum[0])
    return int(pnum[0])
cook = {"Cookie": "XXXXXX"}
username = 'XXXXX'
url0='http://weibo.cn/%s/follow' % username
print url0
htm = getHtml(url0, cook)
pagenum=getPageNum(htm)
for pn in range(1, pagenum + 1):  # visit every page of the follow list
    urls = url0 + '?page=%s' % pn
    print(urls)
    htms = getHtml(urls, cook)
    # each followed account sits in its own table; td[2] holds the name link
    xptxt = '//td[2]//a[1]'
    xphref = '//td[2]//a[1]/@href'  # link to the account's page
    consTxt = getContent(htms, xptxt)
    consHref = getContent(htms, xphref)
    for indx in range(1, len(consTxt)):  # the first match is skipped
        followTxt = consTxt[indx].text + ' ' + consHref[indx]
        print(followTxt)
        tosave(followTxt)
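Each line written by tosave is just the display name and the relative link separated by a single space, so the results can be loaded back with a few lines. The reader below is a minimal sketch, not part of the original script; it assumes it is run from the crawler's folder and reuses the same placeholder uid.
# -*- coding: utf-8 -*-
import codecs

username = 'XXXXX'  # same placeholder uid as in the crawler above
with codecs.open(username + '/following.txt', 'r', 'utf-8') as f:
    for line in f:
        # "<display name> <relative href>": split at the last space,
        # since a display name may itself contain spaces
        name, _, link = line.strip().rpartition(' ')
        print(name + ' -> ' + link)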