Scraping arXiv papers with Python

Every day I check arXiv for the latest computer vision (cs.CV) listings, and every paper I download is named after its arXiv identifier, so sorting the files out takes time. So I wrote a very simple Python crawler for my own use; it saves more than enough time to be worth it.

import requests
from lxml import etree
import os
import time
import re
from multiprocessing.dummy import Pool

def getHtml(url):
    # fetch a page and parse it into an lxml element tree
    html = requests.get(url).content
    selector = etree.HTML(html)
    return selector
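
As written, getHtml has no timeout and happily parses error pages. A slightly more defensive variant, purely a sketch of my own (the name getHtmlSafe and the 30-second timeout are not part of the original script):

def getHtmlSafe(url):
    # same idea as getHtml, but fail loudly on HTTP errors and never hang forever
    resp = requests.get(url, timeout=30)  # 30 s is an arbitrary choice
    resp.raise_for_status()
    return etree.HTML(resp.content)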

def getContent(htm, xpathStr):
    # run an XPath query against an already-parsed page
    content = htm.xpath(xpathStr)
    return content

def getDownPdf(cons, title, folder):
    # save one PDF as ./arxiv/<folder>/<title>.pdf next to this script
    pa = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'arxiv', folder)
    # create the target folder (and the arxiv parent) if it does not exist yet
    if not os.path.exists(pa):
        os.makedirs(pa)
    fl = os.path.join(pa, '%s.pdf' % title)
    r = requests.get(cons)
    with open(fl, "wb") as code:
        code.write(r.content)
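
Paper titles can contain line breaks, slashes or colons, none of which belong in a filename. The script imports re but never uses it; one guess at the intended use is a small sanitizer like the one below (sanitize_title is my own name), which could be applied to title before building the path in getDownPdf:

def sanitize_title(title):
    # collapse internal whitespace / newlines, then replace characters that
    # are awkward or illegal in filenames on common filesystems
    title = re.sub(r'\s+', ' ', title).strip()
    return re.sub(r'[\\/:*?"<>|]', '_', title)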

#### main ###
url0 = 'http://arxiv.org/list/cs.CV/recent'
print(url0)
# xpath of each page
xp1 = '//dl[1]//*[@class="list-identifier"]//a[2]//@href'  # pdf href list
xp2 = '//dl[1]//*[@class="list-title"]/text()'  # Title
xp_date = '//*[@id="dlpage"]/h3[1]/text()'  # date->folder

htm0 = getHtml(url0)
cons1 = getContent(htm0, xp1)  # get pdfs' href
cons2 = getContent(htm0, xp2)  # get papers' title
cons_date = getContent(htm0, xp_date) # get date

folder = cons_date[0].split(', ')  # split the date heading; folder[1] is the date used as the folder name

print(folder[1] + ': %s files found' % len(cons1))
print('pdfs are downloading...')

for indx in range(len(cons1)):
    href = 'http://arxiv.org' + cons1[indx]
    title = cons2[2 * indx + 1].strip()  # title text sits at the odd indices; strip stray whitespace
    print('%s. %s %s' % (indx + 1, href, title))
    getDownPdf(href, title, folder[1])
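
Pool from multiprocessing.dummy is imported but never used; presumably the downloads were meant to run in worker threads. A minimal sketch of how the sequential loop above could be swapped for a thread pool (the pool size of 4 is my own choice):

def download_one(args):
    # unpack one (pdf href, title) task and reuse the download helper
    href, title = args
    getDownPdf(href, title, folder[1])

tasks = [('http://arxiv.org' + cons1[i], cons2[2 * i + 1].strip())
         for i in range(len(cons1))]
pool = Pool(4)  # 4 worker threads; adjust to taste
pool.map(download_one, tasks)
pool.close()
pool.join()

time is also imported without being used; a time.sleep between requests would be the polite way to throttle things if arXiv ever objects to the traffic.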