1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
| import requests import re import time import hashlib
def get_page(url): print('GET %s' %url) try: response=requests.get(url) if response.status_code == 200: return response.content except Exception: pass
def parse_index(res): obj=re.compile('class="items.*?<a href="(.*?)"',re.S) detail_urls=obj.findall(res.decode('gbk')) for detail_url in detail_urls: if not detail_url.startswith('http'): detail_url='http://www.xiaohuar.com'+detail_url yield detail_url
def parse_detail(res): obj=re.compile('id="media".*?src="(.*?)"',re.S) res=obj.findall(res.decode('gbk')) if len(res) > 0: movie_url=res[0] return movie_url
def save(movie_url): response=requests.get(movie_url,stream=False) if response.status_code == 200: m=hashlib.md5() m.update(('%s%s.mp4' %(movie_url,time.time())).encode('utf-8')) filename=m.hexdigest() with open(r'./movies/%s.mp4' %filename,'wb') as f: f.write(response.content) f.flush()
def main(): index_url='http://www.xiaohuar.com/list-3-{0}.html' for i in range(5): print('*'*50,i) index_page=get_page(index_url.format(i,)) detail_urls=parse_index(index_page) for detail_url in detail_urls: detail_page=get_page(detail_url) movie_url=parse_detail(detail_page) if movie_url: save(movie_url)
if __name__ == '__main__': main()
from concurrent.futures import ThreadPoolExecutor import queue import requests import re import time import hashlib from threading import current_thread
p=ThreadPoolExecutor(50)
def get_page(url): print('%s GET %s' %(current_thread().getName(),url)) try: response=requests.get(url) if response.status_code == 200: return response.content except Exception as e: print(e)
def parse_index(res): print('%s parse index ' %current_thread().getName()) res=res.result() obj=re.compile('class="items.*?<a href="(.*?)"',re.S) detail_urls=obj.findall(res.decode('gbk')) for detail_url in detail_urls: if not detail_url.startswith('http'): detail_url='http://www.xiaohuar.com'+detail_url p.submit(get_page,detail_url).add_done_callback(parse_detail)
def parse_detail(res): print('%s parse detail ' %current_thread().getName()) res=res.result() obj=re.compile('id="media".*?src="(.*?)"',re.S) res=obj.findall(res.decode('gbk')) if len(res) > 0: movie_url=res[0] print('MOVIE_URL: ',movie_url) with open('db.txt','a') as f: f.write('%s\n' %movie_url) p.submit(save,movie_url) print('%s下载任务已经提交' %movie_url) def save(movie_url): print('%s SAVE: %s' %(current_thread().getName(),movie_url)) try: response=requests.get(movie_url,stream=False) if response.status_code == 200: m=hashlib.md5() m.update(('%s%s.mp4' %(movie_url,time.time())).encode('utf-8')) filename=m.hexdigest() with open(r'./movies/%s.mp4' %filename,'wb') as f: f.write(response.content) f.flush() except Exception as e: print(e)
def main(): index_url='http://www.xiaohuar.com/list-3-{0}.html' for i in range(5): p.submit(get_page,index_url.format(i,)).add_done_callback(parse_index)
if __name__ == '__main__': main()
|