Saturday, May 16, 2015

Python 做网络爬虫下载数据

#!/usr/bin/python

import re
import urllib

def getHtml(url):
    """Fetch *url* and return the response body.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``read()`` on the opened response yields (the whole body).
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Always release the connection, even when read() raises.
        page.close()

def getImg(html, start=10):
    """Download every cached image referenced in *html*.

    A URL is recognised when it begins with ``http://cache`` and its closing
    quote is immediately followed by `` alt``. Each match is saved in the
    current working directory as ``<n>.jpg``, where ``n`` counts up from
    *start*, and the downloaded URL is printed.

    Args:
        html: Page source to scan.
        start: Number used for the first file name (defaults to 10).

    Returns:
        None.
    """
    # [^"]+ stops at the attribute's closing quote. The previous greedy ".+"
    # ran on to the LAST '" alt' on a line, so several image tags on one line
    # collapsed into a single bogus URL.
    imgre = re.compile(r'(http://cache[^"]+)" alt')
    for number, imgurl in enumerate(imgre.findall(html), start):
        # "&" appears as "&amp;" inside HTML attributes; undo exactly that
        # entity instead of deleting every "amp;" substring.
        trimurl = imgurl.replace("&amp;", "&")
        urllib.urlretrieve(trimurl, '%s.jpg' % number)
        print(trimurl)
   

# Gallery page that is scanned for image links.
PAGE_URL = "http://www.photos.com/prints/photographers/the-complete-slim-aarons-collection"

# Fetch the page source, then hand it to getImg to save the matching images.
html = getHtml(PAGE_URL)
getImg(html)

No comments:

Post a Comment