# coded by 伊玛目的门徒
# coding=utf-8
"""Crawl futures articles from futures.hexun.com and publish them to WordPress.

The script runs in two phases, both executed at module level:

1. ``multithreading()`` downloads the index pages listed in ``list1`` and
   collects article links and titles into ``urllist`` / ``titlelist``.
2. ``multithreading_con()`` downloads every collected article and uploads it
   through the WordPress XML-RPC endpoint configured below.
"""
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
from wordpress_xmlrpc.methods.users import GetUserInfo

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
header = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.XXXX.XXX Safari/537.36'}

# Index page URL; ``{}`` is replaced by the page number.
INDEX_URL = 'http://futures.hexun.com/domestic/index-{}.html'

# Seconds to wait for an HTTP response before treating the request as failed.
REQUEST_TIMEOUT = 15

# Upper bound on retry passes, so a permanently failing page or article
# cannot keep the script looping forever.
MAX_ROUNDS = 5

# WordPress XML-RPC endpoint and credentials (placeholders: fill in your own).
WP_ENDPOINT = 'http://www.6324.xyz/xmlrpc.php'
WP_USERNAME = '你的用户名'
WP_PASSWORD = '你的密码'

# ---------------------------------------------------------------------------
# Shared state
# ---------------------------------------------------------------------------
# urllist[k] and titlelist[k] always describe the same article.
urllist = []
titlelist = []

# Index pages still to be crawled. Use list(range(1, 393, 1)) for the full run.
list1 = list(range(392, 393, 1))

# Indices into urllist whose article body has not been downloaded yet.
listj = []

# Guards every mutation of the shared lists performed by worker threads.
_state_lock = threading.Lock()


def do(i):
    """Crawl index page ``i`` and record the article links found on it.

    On success the (url, title) pairs are appended to ``urllist`` and
    ``titlelist`` and ``i`` is removed from ``list1``. On a network or HTTP
    error the failure is printed and ``i`` stays in ``list1`` so that it can
    be retried.

    Returns:
        True if the page was fetched and processed, False otherwise.
    """
    try:
        response = requests.get(INDEX_URL.format(i), headers=header,
                                timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('第 {} 页抓取失败: {}'.format(i, exc))
        return False

    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, "lxml")
    anchors = soup.select('div.temp01 ul li a[target="_blank"]')

    # Only every second anchor (odd positions) is kept, as in the original
    # script. NOTE(review): presumably the even ones are not article links -
    # confirm against the live page.
    pairs = []
    for anchor in anchors[1::2]:
        href = anchor.get('href')
        title = anchor.get_text(strip=True)
        # Reading both values from the same tag keeps URL and title paired;
        # incomplete entries are dropped here instead of being filtered later.
        if href and title:
            pairs.append((href, title))

    with _state_lock:
        # Both lists are extended inside one critical section so entries from
        # concurrently running workers cannot interleave and shift the indices.
        urllist.extend(href for href, _ in pairs)
        titlelist.extend(title for _, title in pairs)
        try:
            list1.remove(i)
        except ValueError:
            pass  # page was not queued (e.g. do() called directly)
    return True


def multithreading(max_rounds=MAX_ROUNDS):
    """Crawl all pages in ``list1`` with a pool of 10 worker threads.

    Pages that fail stay in ``list1`` and are retried, for at most
    ``max_rounds`` passes in total.

    Args:
        max_rounds: Maximum number of passes over the pending pages.

    Returns:
        Number of pages crawled successfully.
    """
    crawled = 0
    for _ in range(max_rounds):
        # Iterate over a snapshot: the workers remove entries from list1
        # while the executor is still consuming its input.
        pending = list(list1)
        if not pending:
            break
        with ThreadPoolExecutor(max_workers=10) as executor:
            crawled += sum(1 for ok in executor.map(do, pending) if ok)
    return crawled


def getcontent(url, j):
    """Download one article and return the HTML of its content box.

    On success ``j`` is removed from ``listj``. On a network or HTTP error
    the failure is printed and ``j`` stays in ``listj``.

    Args:
        url: Address of the article page.
        j: Index of the article in ``urllist`` / ``titlelist``.

    Returns:
        The concatenated markup of all ``div.art_contextBox`` elements
        (possibly an empty string), or None if the download failed.
    """
    try:
        response = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('第 {} 号任务正文抓取失败: {}'.format(j, exc))
        return None

    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, "lxml")
    content = ''.join(str(node) for node in soup.select('div.art_contextBox '))

    with _state_lock:
        try:
            listj.remove(j)
        except ValueError:
            pass  # j was not queued (e.g. getcontent() called directly)
    return content


def wpsend(title, content):
    """Publish one post to WordPress through XML-RPC.

    Args:
        title: Post title.
        content: Post body; it is converted with ``str()`` before sending.

    Any exception raised by the XML-RPC client propagates to the caller.
    """
    wp = Client(WP_ENDPOINT, WP_USERNAME, WP_PASSWORD)
    post = WordPressPost()
    post.title = title
    post.content = " " + str(content) + " "
    post.post_status = 'publish'
    post.terms_names = {
        'post_tag': ['操盘策略'],
        'category': ['期货'],
    }
    wp.call(NewPost(post))
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('文档已上传,执行时间 {}'.format(timestamp))


def work(j):
    """Download article ``j`` and upload it to WordPress.

    Nothing is uploaded when the download failed, so no post with an empty
    placeholder body is created. An upload error is printed and the task is
    not retried.

    Returns:
        True if the article was uploaded, False otherwise.
    """
    url = urllist[j]
    title = titlelist[j]
    content = getcontent(url, j)
    if content is None:
        return False
    try:
        wpsend(title, content)
    except Exception as exc:
        # Task boundary: report the failure here so that one bad upload does
        # not abort the remaining tasks of the pool.
        print('第 {} 号任务上传失败: {}'.format(j, exc))
        return False
    print('成功完成任务采集任务第 {}号任务'.format(j))
    return True


def multithreading_con(max_rounds=MAX_ROUNDS):
    """Process every collected article with a pool of 5 worker threads.

    ``listj`` is reset to all valid article indices. Articles whose download
    failed stay in ``listj`` and are retried, for at most ``max_rounds``
    passes in total.

    Args:
        max_rounds: Maximum number of passes over the pending articles.

    Returns:
        Number of articles uploaded successfully.
    """
    # Never index past the shorter list, whatever filled them.
    listj[:] = range(min(len(urllist), len(titlelist)))
    uploaded = 0
    for _ in range(max_rounds):
        # Snapshot for the same reason as in multithreading().
        pending = list(listj)
        if not pending:
            break
        with ThreadPoolExecutor(max_workers=5) as executor:
            uploaded += sum(1 for ok in executor.map(work, pending) if ok)
    return uploaded


# ---------------------------------------------------------------------------
# Script body (runs at import time, as in the original flat script)
# ---------------------------------------------------------------------------
start = time.perf_counter()  # timer start

page_count = multithreading()
print('还剩下{}页'.format(list1))
end = time.perf_counter()  # timer: crawl finished
print(("爬取完成 用时:"))
print((end - start))
print('总爬取 %d 页 ' % (page_count))

# Optional: dump the collected URLs to a text file.
# with open("test.txt", "w") as f:
#     for thing in urllist:
#         f.write(thing)
#         f.write('\r\n')

multithreading_con()
end = time.perf_counter()  # timer: upload finished
print("全部上传完成 用时:")
print((end - start))
# B站 演示视频 (demo video on Bilibili): https://www.bilibili.com/video/av80154643/
# 网站战士 (website): www.6324.xyz