请教一道 Python python爬虫多线程pool爬虫的面试题

点击联系发帖人 时间：2017-07-24 01:38

java多线程爬虫原理

简单多线程爬虫问题【python吧】_百度贴吧
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&签到排名：今日本吧第个签到，本吧因你更精彩，明天继续来努力！
本吧签到人数：0成为超级会员，使用一键签到本月漏签0次！成为超级会员，赠送8张补签卡连续签到：天&&累计签到：天超级会员单次开通12个月以上，赠送连续签到卡3张
关注：153,943贴子：
简单多线程爬虫问题收藏
本人初学python，还请知道的吧友不吝赐教。代码每次运行到spider函数内的content=each.xpath()这个地方的时候就报错：Traceback (most recent call last):
File "E:/data/C++/Python/??????/XPath-and-multithreading-crawler_v1/???/tiebaspider.py", line 50, in <module>
results = pool.map(spider, page)
File "E:\data\C++\Python\python2.7.9\lib\multiprocessing\pool.py", line 251, in map
return self.map_async(func, iterable, chunksize).get()
File "E:\data\C++\Python\python2.7.9\lib\multiprocessing\pool.py", line 558, in get
raise self._valueIndexError: list index out of range以下为源代码#-*-coding:utf8-*-from lxml import etreefrom multiprocessing.dummy import Pool as ThreadPoolimport requestsimport jsonimport sysimport arrayreload(syssys.setdefaultencoding('utf-8'def towrite(contentdict):
f.writelines(u'回帖时间:' + str(contentdict['topic_reply_time']) + '\n')
f.writelines(u'回帖内容:' + array.tounicode(contentdict['topic_reply_content']) + '\n')
f.writelines(u'回帖人:' + contentdict['user_name'] + '\n\n')def spider(url):
html = requests.get(url)
selector = etree.HTML(html.text)
print selector
content_field = selector.xpath('//div[@class=&l_post j_l_post l_post_bright
for each in content_field:
reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot', ''))
author = reply_info['author']['user_name']
content = each.xpath('div[@class=&d_post_content_main&]/div/cc/div[@class=&d_post_content j_d_post_content
fix&]/text()')[0]
reply_time = reply_info['content']['date']
print (content)
print (reply_time)
print (author)
item['user_name'] = author
item['topic_reply_content'] = content
item['topic_reply_time'] = reply_time
towrite(item)if __name__ == '__main__':
pool = ThreadPool(2)
f = open('content.txt', 'a')
for i in range(1, 21):
newpage = '' + str(i)
page.append(newpage)
results = pool.map(spider, page)
pool.close()
pool.join()
python_总监级名师全程面授,项目实战案例式教学,企业需求无缝对接,助你无忧就业!python,0基础23周快速实现高薪就业,0元试听两周.名额有限,欲报从速.点击抢座
专业长期代写并讲解，诚信服务，欢迎下订单！
登录百度帐号推荐应用> 一个简单的多线程Python爬虫（一）
一个简单的多线程Python爬虫（一）
相关推荐：http://blog.csdn.net/bravezhe/article/details/8585437 Python：使用threading模块实现多线程编程分类： PYTHON
15:39 1482人阅读评论(0) 收藏举报目录()[+] Python：使用threading模块实现多线程编程一[综述] Python这门解释性语言也有专
一个简单的多线程Python爬虫最近想要抓取拉勾网的数据，最开始是使用Scrapy的，但是遇到了下面两个问题: 前端页面是用JS模板引擎生成的接口主要是用POST提交参数的目前不会处理使用JS模板引擎生成的HTML页面，用POST的提交参数的话，接口统一，也没有必要使用Scrapy，所以就萌生了自己写一个简单的Python爬虫的想法。本文中的部分链接可能需要FQ。参考资料： /developerworks/aix/library/au-threadingpython/ /questions//python-threading-how-do-i-lock-a-thread 一个爬虫的简单框架一个简单的爬虫框架，主要就是处理网络请求，Scrapy使用的是Twisted（一个事件驱动网络框架，以非阻塞的方式对网络I/O进行异步处理），这里不使用异步处理，等以后再研究这个框架。如果使用的是Python3.4及其以上版本，到可以使用asyncio这个标准库。这个简单的爬虫使用多线程来处理网络请求，使用线程来处理URL队列中的url，然后将url返回的结果保存在另一个队列中，其它线程在读取这个队列中的数据，然后写到文件中去。该爬虫主要用下面几个部分组成。 1 URL队列和结果队列将将要爬去的url放在一个队列中，这里使用标准库Queue。访问url后的结果保存在结果队列中初始化一个URL队列 from Queue import Queueurls_queue = Queue()out_queue = Queue() 2 请求线程使用多个线程，不停的取URL队列中的url，并进行处理： import threadingclass ThreadCrawl(threading.Thread):def __init__(self, queue, out_queue):threading.Thread.__init__(self)self.queue = queueself.out_queue = out_queuedef run(self):while True:item = self.queue.get()self.queue.task_down() 下面是部分标准库Queue的使用方法: Queue.get([block[, timeout]]) Remove and return an item from the queue. If optional args block is true and timeout is None (the default), block if necessary until an item is available. Queue.task_done() Indicate that a formerly enqueued task is complete. Used by queue consumer threads. For each get() used to fetch a task, a subsequent call to task_done() tells the queue that the processing on the task is complete. 如果队列为空，线程就会被阻塞，直到队列不为空。处理队列中的一条数据后，就需要通知队列已经处理完该条数据。处理线程处理结果队列中的数据，并保存到文件中。如果使用多个线程的话，必须要给文件加上锁。 lock = threading.Lock()f = codecs.open('out.txt', 'w', 'utf8') 当线程需要写入文件的时候，可以这样处理： with lock:f.write(something) 程序的执行结果运行状态：抓取结果：源码代码还不完善，将会持续修改中。 # coding: utf-8'''Author mr_zysEmail
'''from Queue import Queueimport threadingimport urllib2import timeimport jsonimport codecsfrom bs4 import BeautifulSoupurls_queue = Queue()data_queue = Queue()lock = threading.Lock()f = codecs.open('out.txt', 'w', 'utf8')class ThreadUrl(threading.Thread):def __init__(self, queue):threading.Thread.__init__(self)self.queue = queuedef run(self):passclass ThreadCrawl(threading.Thread):def __init__(self, url, queue, out_queue):threading.Thread.__init__(self)self.url = urlself.queue = queueself.out_queue = out_queuedef run(self):while True:item = self.queue.get()data = self._data_post(item)try:req = urllib2.Request(url=self.url, data=data)res = urllib2.urlopen(req)except urllib2.HTTPError, e:raise e.reasonpy_data = json.loads(res.read())res.close()item['first'] = 'false'item['pn'] = item['相关推荐：可以成功的爬取我室友的微博首页 #-*-coding:utf8-*-import requestsfrom lxml import etreecook = {&Cookie&: &此处请填写你获取到的Cookie&}url = '/u/xxxxxxxx' #此处请修改为微博网址# html = requests.get(url).content# print hpn'] + 1success = py_data['success']if success:print 'Get success...'else:print 'Get fail....'print 'pn is : %s' % item['pn']result = py_data['content']['result']if len(result) != 0:self.queue.put(item)print 'now queue size is: %d' % self.queue.qsize()self.out_queue.put(py_data['content']['result'])self.queue.task_done()def _data_post(self, item):pn = item['pn']first = 'false'if pn == 1:first = 'true'return 'first=' + first + '&pn=' + str(pn) + '&kd=' + item['kd']def _item_queue(self):passclass ThreadWrite(threading.Thread):def __init__(self, queue, lock, f):threading.Thread.__init__(self)self.queue = queueself.lock = lockself.f = fdef run(self):while True:item = self.queue.get()self._parse_data(item)self.queue.task_done()def _parse_data(self, item):for i in item:l = self._item_to_str(i)with self.lock:print 'write %s' % lself.f.write(l)def _item_to_str(self, item):positionName = item['positionName']positionType = item['positionType']workYear = item['workYear']education = item['education']jobNature = 
item['jobNature']companyName = item['companyName']companyLogo = item['companyLogo']industryField = item['industryField']financeStage = item['financeStage']companyShortName = item['companyShortName']city = item['city']salary = item['salary']positionFirstType = item['positionFirstType']createTime = item['createTime']positionId = item['positionId']return positionName + ' ' + positionType + ' ' + workYear + ' ' + education + ' ' + \jobNature + ' ' + companyLogo + ' ' + industryField + ' ' + financeStage + ' ' + \companyShortName + ' ' + city + ' ' + salary + ' ' + positionFirstType + ' ' + \createTime + ' ' + str(positionId) + '\n'def main():for i in range(4):t = ThreadCrawl('/jobs/positionAjax.json', urls_queue, data_queue)t.setDaemon(True)t.start()datas = [{'first': 'true', 'pn': 1, 'kd': 'Java'}#{'first': 'true', 'pn': 1, 'kd': 'Python'}]for d in datas:urls_queue.put(d)for i in range(4):t = ThreadWrite(data_queue, lock, f)t.setDaemon(True)t.start()urls_queue.join()data_queue.join()with lock:f.close()print 'data_queue siez: %d' % data_queue.qsize()main() 总结主要是熟悉使用Python的多线程编程，以及一些标准库的使用Queue、threading。
一个简单的多线程Python爬虫最近想要抓取拉勾网的数据，最开始是使用Scrapy的，但是遇到了下面两个问题: 前端页面是用JS模板引擎生成的接口主要是用POST提交参数的目前不会处理使用JS模板引
------分隔线----------------------------
相关阅读排行
相关最新文章
Copyright 2012- ( Coin163 ) All Rights Reserved &&在 SegmentFault，解决技术问题
每个月，我们帮助 1000 万的开发者解决各种各样的技术问题。并助力他们在技术能力、职业生涯、影响力上获得提升。
一线的工程师、著名开源项目的作者们，都在这里：
获取验证码
已有账号？
问题对人有帮助，内容完整，我也想知道答案
问题没有实际价值，缺少关键内容，没有改进余地
我向别人讨教，在scrapy中使用多线程提高抓取效率的问题，有人回复我说，这涉及Python全局解释器锁（GIL）的问题，我搜索了一下，发现GIL限制Python同一时间只能有一个线程运行。如果是这样的话，那threading库的存在是怎么回事？那多线程爬虫到底存在不存在？
答案对人有帮助，有参考价值
答案没帮助，是错误的答案，答非所问
确切来说：GIL 限制 Python 同一时间只能有一个线程持有 GIL。
通常在进入一些不需要 Python 解释器的 C 代码之前，库程序会/应：
Py_BEGIN_ALLOW_THREADS
// ... Do some blocking I/O operation ...
Py_END_ALLOW_THREADS
因此回答问题：
threading 库仍然是有效的多线程，只不过在执行基于 Python 解释器的 CPU 集中的操作时，不要指望它（CPython）能充分利用多核 CPU；
存在的，Python 多线程可以实现并行抓取（I/O 集中），但无法用作 Python 多核并行处理（CPU 集中）。
答案对人有帮助，有参考价值
答案没帮助，是错误的答案，答非所问
因为爬虫程序90%的时间是在等啊
答案对人有帮助，有参考价值
答案没帮助，是错误的答案，答非所问
明显不是，你好好看看官方的文档
同步到新浪微博
分享到微博？
关闭理由：
删除理由：
忽略理由：
推广（招聘、广告、SEO 等）方面的内容
与已有问题重复（请编辑该提问指向已有相同问题）
答非所问，不符合答题要求
宜作评论而非答案
带有人身攻击、辱骂、仇恨等违反条款的内容
无法获得确切结果的问题
非开发直接相关的问题
非技术提问的讨论型问题
其他原因（请补充说明）
我要该，理由是：python多线程多队列（BeautifulSoup网络爬虫）
程序大概内容如下：
程序中设置两个队列分别为queue负责存放网址，out_queue负责存放网页的源代码。
ThreadUrl线程负责将队列queue中网址的源代码urlopen，存放到out_queue队列中。
DatamineThread线程负责使用BeautifulSoup模块从out_queue网页的源代码中提取出想要的内容并输出。
这只是一个基本的框架，可以根据需求继续扩展。
程序中有很详细的注释，如有问题跪求指正啊。
import Queue
import threading
import urllib2
import time
from BeautifulSoup import BeautifulSoup
# NOTE(review): the URL strings are empty in this listing and the list was
# left unterminated; fill in the real hosts before running.
hosts = ["", "", ""]
queue = Queue.Queue()  # queue of URLs waiting to be fetched
out_queue = Queue.Queue()  # queue of downloaded page sources
class ThreadUrl(threading.Thread):
    """Worker thread that downloads the pages for URLs taken from a queue.

    Every URL read from ``queue`` is opened with ``urllib2.urlopen`` and the
    raw response body is put on ``out_queue``.
    """

    def __init__(self, queue, out_queue):
        """Remember the input URL queue and the output page queue."""
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        """Fetch URLs forever; this loop never returns on its own."""
        import sys  # local import: only needed for error reporting

        while True:
            host = self.queue.get()
            try:
                url = urllib2.urlopen(host)
                try:
                    chunk = url.read()
                finally:
                    url.close()
                # Hand the page source of this host over to out_queue.
                self.out_queue.put(chunk)
            except (IOError, ValueError) as exc:
                # One unreachable or malformed URL must not kill the worker.
                sys.stderr.write('failed to fetch %r: %s\n' % (host, exc))
            finally:
                # Always acknowledge the item; otherwise queue.join() would
                # block forever after a failed download.
                self.queue.task_done()
class DatamineThread(threading.Thread):
    """Worker thread that parses page sources taken from ``out_queue``.

    Each chunk is parsed with ``BeautifulSoup`` and its ``title`` tags are
    printed.
    """

    def __init__(self, out_queue):
        """Remember the queue that delivers downloaded page sources."""
        threading.Thread.__init__(self)
        self.out_queue = out_queue

    def run(self):
        """Parse chunks forever; this loop never returns on its own."""
        while True:
            chunk = self.out_queue.get()
            try:
                # Search the page source for the contents of the title tag.
                soup = BeautifulSoup(chunk)
                print(soup.findAll(['title']))
            finally:
                # Acknowledge the item even if parsing failed, so that
                # out_queue.join() cannot hang on it.
                self.out_queue.task_done()
start = time.time()
def main():
for i in range(5):
t = ThreadUrl(queue,out_queue)#线程任务就是将网址的源代码存放到out_queue队列中
t.setDaemon(True)#设置为守护线程
#将网址都存放到queue队列中
for host in hosts:
queue.put(host)
for i in range(5):
dt = DatamineThread(out_queue)#线程任务就是从源代码中解析出基于Python多线程、异步＋多进程爬虫实例代码 - 维维软件园
基于Python多线程、异步＋多进程爬虫实例代码
来源：本站整理作者：佚名时间： 8:49:54(0)
本文将向大家详细介绍Python多线程、异步＋多进程爬虫示例，有兴趣的编程爱好者可以看下。
安装Tornado
省事点可直接用grequests库，下面用的是tornado的异步client。异步用到了tornado，根据官方文档的例子修改得到一个非常简单的异步爬虫类。可参考一下最新的文档学习一下。
pip install tornado
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
import traceback
class AsySpider(object):
    """A simple class of asynchronous spider.

    URLs are pushed through a tornado queue and fetched by ``concurrency``
    worker coroutines. Subclasses customise ``fetch`` (request options) and
    ``handle_html`` / ``handle_response`` (what to do with a page).
    """

    def __init__(self, urls, concurrency=10, **kwargs):
        """Store the work list.

        ``urls`` is reversed in place (the caller's list is modified) so that
        ``pop()`` later yields the URLs in their original order.
        """
        urls.reverse()
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()

    def fetch(self, url, **kwargs):
        """Start an asynchronous HTTP fetch and return its future."""
        return httpclient.AsyncHTTPClient().fetch(url, **kwargs)

    def handle_html(self, url, html):
        """Handle an html page; the default just prints the URL."""
        print(url)

    def handle_response(self, url, response):
        """Dispatch on the response code; inherit and rewrite this method.

        ``response`` may be the exception returned by ``get_page``; it is
        ignored when it carries no ``code`` attribute.
        """
        code = getattr(response, 'code', None)
        if code == 200:
            self.handle_html(url, response.body)
        elif code == 599:
            # Put the URL back on the queue so it is fetched again.
            self._fetching.remove(url)
            self._q.put(url)

    @gen.coroutine
    def get_page(self, url):
        """Fetch ``url``; return the response, or the exception on failure."""
        try:
            response = yield self.fetch(url)
            print('######fetched %s' % url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self):
        """Drive the crawl until the work queue has been fully processed."""

        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                print('fetching****** %s' % current_url)
                self._fetching.add(current_url)

                response = yield self.get_page(current_url)
                self.handle_response(current_url, response)  # handle response

                self._fetched.add(current_url)

                # Feed up to `concurrency` more URLs into the queue.
                for _ in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                # Every get() must be matched by a task_done(), or join()
                # below never completes.
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        # Add the first url (nothing to do for an empty work list).
        if self.urls:
            self._q.put(self.urls.pop())

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))
        assert self._fetching == self._fetched

    def run(self):
        """Run the crawl to completion on the current IOLoop."""
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)
class MySpider(AsySpider):
    """Spider that sends a fixed cookie and User-Agent with every request."""

    def fetch(self, url, **kwargs):
        """Override the parent fetch to add cookies, headers, timeout, etc.

        Options passed by the caller take precedence over the defaults set
        here.
        """
        # Cookie string copied from the browser.
        cookies_str = "PHPSESSID=j1tt66a829idnms56ppb70jri4; pspt=%7B%22id%22%3A%%2C%22pswd%22%3A%cab016fbf9eC%22_code%22%3A%22f779dcd011f4ee1b%7D; key=%E9%87%8D%E5%BA%86%E5%95%84%E6%9C%A8%E9%B8%9F%E7%BD%91%E7%BB%9C%E7%A7%91%E6%8A%80%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8; think_language=zh- SERVERID=a66d7d08fa1c8b2e37dbdc6ffff82d9e||; CNZZDATA=--%7C"
        headers = {
            'User-Agent': 'mozilla/5.0 ( baiduspider/2.0; +/search/spider.html)',
            'cookie': cookies_str,
        }
        # See the tornado documentation for the available request options.
        kwargs.setdefault('headers', headers)
        kwargs.setdefault('request_timeout', 1)
        return super(MySpider, self).fetch(url, **kwargs)

    def handle_html(self, url, html):
        """Print the URL together with the page body."""
        print(url, html)
def main():
    """Crawl pages 1 to 99 with MySpider."""
    # NOTE(review): the URL template has no scheme or host in this listing;
    # prepend the real site address before running.
    urls = ['?page=%s' % page for page in range(1, 100)]
    s = MySpider(urls)
    s.run()


if __name__ == '__main__':
    main()
可以继承这个类，塞一些url进去，然后重写handle_html处理得到的页面。
异步＋多进程爬虫
还可以再变态点，加个进程池，使用了multiprocessing模块。效率飕飕的,
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time
from multiprocessing import Pool
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
class AsySpider(object):
    """A simple class of asynchronous spider.

    URLs are pushed through a tornado queue, fetched by ``concurrency``
    worker coroutines, and each page body is saved to a local file.
    """

    def __init__(self, urls, concurrency):
        """Store the work list.

        ``urls`` is reversed in place (the caller's list is modified) so that
        ``pop()`` later yields the URLs in their original order.
        """
        urls.reverse()
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()

    def handle_page(self, url, html):
        """Write ``html`` to a file named after the last path segment of ``url``."""
        # [-1] instead of [1]: a URL without any '/' must not raise IndexError.
        filename = url.rsplit('/', 1)[-1]
        # Binary mode: the page body is raw bytes, not decoded text.
        with open(filename, 'wb') as f:
            f.write(html)

    @gen.coroutine
    def get_page(self, url):
        """Fetch ``url``; return the body, or empty bytes on failure."""
        try:
            response = yield httpclient.AsyncHTTPClient().fetch(url)
            print('######fetched %s' % url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return(b'')
        raise gen.Return(response.body)

    @gen.coroutine
    def _run(self):
        """Drive the crawl until the work queue has been fully processed."""

        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                print('fetching****** %s' % current_url)
                self._fetching.add(current_url)

                html = yield self.get_page(current_url)
                self._fetched.add(current_url)
                self.handle_page(current_url, html)

                # Feed up to `concurrency` more URLs into the queue.
                for _ in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                # Every get() must be matched by a task_done(), or join()
                # below never completes.
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        # Seed the queue with the first URL (nothing to do for an empty list).
        if self.urls:
            self._q.put(self.urls.pop())

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))
        assert self._fetching == self._fetched

    def run(self):
        """Run the crawl to completion on the current IOLoop."""
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)
def run_spider(beg, end):
    """Crawl pages ``beg`` (inclusive) to ``end`` (exclusive) with one spider."""
    urls = ['http://127.0.0.1/%s.htm' % page for page in range(beg, end)]
    s = AsySpider(urls, 10)
    s.run()
def main():
    """Split the page range into chunks and crawl them in a process pool."""
    from multiprocessing import cpu_count  # local: only needed here

    _st = time.time()
    p = Pool()
    all_num = 73000
    num = cpu_count()  # number of cpu cores
    # max(1, ...) keeps the range step valid when all_num < num.
    per_num = max(1, all_num // num)

    # Consecutive (begin, end) pairs; the last one is capped at all_num.
    starts = range(0, all_num, per_num)
    res = [(beg, min(beg + per_num, all_num)) for beg in starts]

    for beg, end in res:
        p.apply_async(run_spider, args=(beg, end))
    # Wait for the workers; otherwise the elapsed time below is meaningless
    # and the pool is torn down when the main process exits.
    p.close()
    p.join()
    print(time.time() - _st)


if __name__ == '__main__':
    main()
多线程爬虫
线程池实现.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import Queue
import sys
import requests
import threading
import time
class Worker(threading.Thread):
    """Daemon thread that handles work requests.

    Jobs are ``(func, args, kwds)`` tuples taken from ``workQueue``; each
    return value is put on ``resultQueue``.
    """

    def __init__(self, workQueue, resultQueue, **kwds):
        """Store both queues; ``kwds`` are passed on to ``threading.Thread``."""
        threading.Thread.__init__(self, **kwds)
        self.daemon = True
        self.workQueue = workQueue
        self.resultQueue = resultQueue

    def run(self):
        """Run queued jobs until the work queue is empty, then exit."""
        while True:
            try:
                # Get a task without blocking.
                func, args, kwds = self.workQueue.get(False)
            except Queue.Empty:
                break
            # Kept outside the try so that a Queue.Empty raised by the job
            # itself is not mistaken for "no more work".
            res = func(*args, **kwds)
            self.resultQueue.put(res)  # put result
class WorkManager:
    """Thread pool manager: creates the workers and owns both queues."""

    def __init__(self, num_of_workers=10):
        """Create the request and result queues and ``num_of_workers`` workers."""
        self.workQueue = Queue.Queue()  # request queue
        self.resultQueue = Queue.Queue()  # queue of job results
        self.workers = []
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        """Create the worker threads (not started yet) and add them to the pool."""
        for _ in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue)
            self.workers.append(worker)

    def start(self):
        """Start every worker thread in the pool."""
        for w in self.workers:
            w.start()

    def wait_for_complete(self):
        """Join all workers, then report completion."""
        while self.workers:
            # Take one thread out of the pool and wait for it.
            worker = self.workers.pop()
            worker.join()
            if worker.is_alive() and not self.workQueue.empty():
                # Still running with work left: put it back into the pool.
                self.workers.append(worker)
        print('All jobs were complete.')

    def add_job(self, callable, *args, **kwds):
        """Add a request to the work queue as a ``(callable, args, kwds)`` tuple."""
        self.workQueue.put((callable, args, kwds))

    def get_result(self, *args, **kwds):
        """Return one result; arguments are passed through to ``Queue.get``."""
        return self.resultQueue.get(*args, **kwds)
def download_file(url):
    """Download ``url`` with ``requests`` and return the response text."""
    # Returning the text (instead of discarding it) lets the caller that
    # collects job results actually see the downloaded page.
    return requests.get(url).text
def main():
    """Download the URL list with a thread pool and print the elapsed time.

    The pool size is taken from the first command-line argument and falls
    back to 10 when it is missing or not an integer.
    """
    try:
        num_of_threads = int(sys.argv[1])
    except (IndexError, ValueError):
        num_of_threads = 10
    _st = time.time()
    wm = WorkManager(num_of_threads)
    print(num_of_threads)
    # NOTE(review): the URL is an empty string in this listing; fill in the
    # real address before running.
    urls = [''] * 1000
    for i in urls:
        wm.add_job(download_file, i)
    wm.start()
    wm.wait_for_complete()
    print(time.time() - _st)


if __name__ == '__main__':
    main()
这三种里面的随便一种都有非常高的效率，不过这么跑的话会给网站服务器增加较大的压力，特别是一些小站点，还是有点节操一点的好。
大家还看了：
[访问统计：]
上一篇：下一篇：}

我就爱股票网