''' @File : 找小说_多线程.py @Time : 2020/05/24 21:44:25 @Author : sorrowfeng @Version : 1.0 @Contact : 1399600304@qq.com @WebSite : https://sorrowfeng.github.io '''
import requests from lxml import etree import os import threading import time
# Search endpoint; Spider.start_requests appends the user's keyword to this URL.
search_url = "http://www.xbiquge.la/modules/article/waps.php?searchkey="
# Presumably the site's full book index page; not referenced by the code
# visible in this file -- TODO confirm whether it is still needed.
all_url = "http://www.xbiquge.la/xiaoshuodaquan/"
# Module-level registry of chapter download threads; Spider.requests_zhang
# appends each started thread here and joins them before reporting completion.
threads = []
def list_dic(list1, list2):
    '''
    Merge two sequences into a dict, pairing elements by position.

    Pairing stops at the end of the shorter sequence; when a key occurs
    more than once, the value paired with its last occurrence wins.

    :param list1: keys
    :param list2: values
    :return: dict mapping each element of list1 to the element of list2
             at the same position
    '''
    # zip() is the idiomatic positional pairing and, like the previous
    # map(lambda ...) form, truncates to the shorter input.
    return dict(zip(list1, list2))
class myThread(threading.Thread):
    '''
    Thread that downloads a single chapter.

    The actual work is delegated to the ``requests_data`` method of the
    module-level ``spider`` object, which must exist by the time the
    thread is started.

    :param zhang_name: chapter title
    :param zhang_url: chapter URL as passed on to ``requests_data``
    :param shu_name: book title
    '''

    def __init__(self, zhang_name, zhang_url, shu_name):
        super().__init__()
        self.shu_name = shu_name
        self.zhang_url = zhang_url
        self.zhang_name = zhang_name

    def run(self):
        # `spider` is looked up as a global when the thread runs, not when
        # the thread object is created.
        chapter_args = (self.zhang_name, self.zhang_url, self.shu_name)
        spider.requests_data(*chapter_args)
class Spider:
    '''
    Search xbiquge.la for a book and download every chapter into a folder.

    Typical use: ``Spider(word).start_requests()``. Chapters are fetched
    concurrently, one ``myThread`` per chapter, and each chapter is written
    to ``<book name>/<chapter name>.txt`` under the current directory.
    '''

    # Class-level defaults kept for backward compatibility; __init__ gives
    # every instance its own copies.
    word = ''
    file_name = ''
    download_num = 0
    all_zhang_num = 0

    # Characters that cannot appear in a Windows file name (plus control
    # whitespace); each one is replaced by '_' when building paths.
    _INVALID_NAME_CHARS = str.maketrans(
        {char: '_' for char in '\\/:*?"<>|\r\n\t'}
    )

    def __init__(self, word):
        '''
        :param word: search keyword (book title or author)
        '''
        self.word = word
        self.file_name = ''
        self.download_num = 0
        # Defined up front so requests_data can compute a percentage even
        # when it is called without requests_zhang having run first.
        self.all_zhang_num = 0
        # requests_data runs in many threads at once; this lock protects
        # the shared progress counter.
        self._progress_lock = threading.Lock()

    @classmethod
    def _safe_name(cls, name):
        '''Return *name* with characters that are illegal in file names replaced.'''
        cleaned = str(name).translate(cls._INVALID_NAME_CHARS).strip()
        return cleaned or '_'

    @classmethod
    def _chapter_path(cls, shu_name, zhang_name):
        '''Return the platform-correct output path for one chapter file.'''
        return os.path.join(
            cls._safe_name(shu_name), cls._safe_name(zhang_name) + '.txt'
        )

    def start_requests(self):
        '''
        Search for ``self.word``, let the user pick a result and download it.

        Prints the numbered result list, prompts for a number until a valid
        one is entered, creates the book directory and then hands over to
        ``requests_zhang``.
        '''
        start_url = search_url + str(self.word)
        response = requests.get(start_url)
        response.encoding = "utf-8"
        myhtml = etree.HTML(response.text)

        cell = "//div[@id='content']/form/table[@class='grid']/tr/td[@class='even']"
        name_list = myhtml.xpath(cell + "[1]/a/text()")
        url_list = myhtml.xpath(cell + "[1]/a/@href")
        author_list = myhtml.xpath(cell + "[2]/text()")
        name_url_dic = list_dic(name_list, url_list)
        name_author_dic = list_dic(name_list, author_list)

        if not name_url_dic:
            print("没有找到此书")
            os.system('pause')
            return

        find_dic = {}
        print('\n')
        for num, name in enumerate(name_url_dic, start=1):
            find_dic[str(num)] = name
            # .get(): the author column may be shorter than the name column,
            # in which case some names have no author entry.
            author = name_author_dic.get(name, '')
            print(str(num) + '. ' + name + ('\t\t作者: ' + str(author)))

        book_num = input('\n你想下载的是(请输入序号):').strip()
        # Re-prompt instead of crashing with KeyError on an unknown number.
        while book_num not in find_dic:
            book_num = input('序号无效,请重新输入:').strip()
        print('\n\n')

        shu_name = find_dic[book_num]
        shu_url = name_url_dic.get(shu_name)
        print('正在为你下载:' + shu_name + '\n')

        # Same sanitized directory name that _chapter_path uses for the files.
        os.makedirs(self._safe_name(shu_name), exist_ok=True)

        self.requests_zhang(shu_name, shu_url)

    def requests_zhang(self, shu_name, shu_url):
        '''
        Fetch the chapter list of a book and download all chapters in threads.

        :param shu_name: book title (used as the output directory name)
        :param shu_url: URL of the book's chapter index page
        '''
        response = requests.get(shu_url)
        response.encoding = 'utf-8'
        html = etree.HTML(response.text)
        zhang_name_list = html.xpath('//div[@id="list"]/dl/dd/a/text()')
        zhang_url_list = html.xpath('//div[@id="list"]/dl/dd/a/@href')

        chapters = list(zip(zhang_name_list, zhang_url_list))
        # Count the pairs that will actually be downloaded so the progress
        # display can reach 100%.
        self.all_zhang_num = len(chapters)

        if not chapters:
            # Previously this case spun forever in a busy-wait loop.
            print('没有找到任何章节')
            os.system('pause')
            return

        started = []
        for zhang_name, zhang_url in chapters:
            worker = myThread(zhang_name, zhang_url, shu_name)
            # Short pauses around start() throttle how fast requests are issued.
            time.sleep(0.015)
            worker.start()
            threads.append(worker)  # keep the module-level registry up to date
            started.append(worker)
            time.sleep(0.015)

        # join() already blocks until each worker is done; no polling needed.
        for worker in started:
            worker.join()

        print('\n下载完成!存放于' + os.getcwd() + '\n')
        print('正在准备退出')
        os.system('pause')

    def requests_data(self, zhang_name, zhang_url, shu_name):
        '''
        Download one chapter and append it to its text file.

        Called concurrently from ``myThread`` instances.

        :param zhang_name: chapter title (also the output file name)
        :param zhang_url: chapter path, appended to the site root URL
        :param shu_name: book title (the output directory name)
        '''
        data_url = 'http://www.xbiquge.la' + zhang_url
        response = requests.get(data_url)
        response.encoding = 'utf-8'
        html = etree.HTML(response.text)
        content = "\n".join(html.xpath('//div[@id="content"]/text()'))

        # `+=` on a shared attribute is not atomic across threads.
        with self._progress_lock:
            self.download_num += 1
            done = self.download_num
            total = self.all_zhang_num
        percent = round(100 * done / total, 2) if total else 0.0

        # The path is kept in a local: storing it only on self let one thread
        # overwrite another thread's target file before it was opened.
        file_name = self._chapter_path(shu_name, zhang_name)
        self.file_name = file_name  # informational only; last writer wins

        print("正在下载:" + zhang_name + "\t已下载 " + str(percent) + "%")

        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, "a", encoding='utf-8') as f:
            f.write('\n\n\n' + str(zhang_name) + '\n\n\n')
            f.write(content)
# Script entry point: ask for a search keyword and run the interactive download.
if __name__ == '__main__':
    # The prompt asks for the title (or author) of the book to download.
    word = input('请输入要下载的书名(或作者):')
    # Must be bound to the module-level name `spider`: myThread.run() looks
    # it up as a global when each download thread executes.
    spider = Spider(word)
    spider.start_requests()
|