现在的位置: 首页 > 编程语言 > 正文

python分布式抓取网页

2019年03月14日 编程语言 ⁄ 共 19751字 ⁄ 字号 评论关闭

呵呵,前两节好像和python没多大关系……这一节完全是贴代码。

?

这是我第一次写python,很多地方比较乱,主要就看看逻辑流程吧。

?

编码格式确实搞得我头大:取下来的页面不知道是什么编码,所以先找charset,然后转成unicode,统一在unicode下操作。但是数据库是utf8的,Windows的控制台必须是gbk的,而我的IDE控制台又必须是utf8的,所以才会有DEBUG这个变量存在,主要是为了控制输出编码。

?

本程序连跑了24小时,然后分布式在10台机器上部署,长时间续航基本没有问题。

之后每天将进行10万次网页的爬取。

?

源码如下:

?

?

内容爬取及工具

?

  1. '''''?
  2. Created?on?2010-9-15?
  3. ?
  4. @author:?chenggong?
  5. '''??
  6. ??
  7. import?urllib2??
  8. import?re??
  9. import?socket??
  10. ??
  11. ??
  12. DEBUG?=?0??
  13. ??
  14. '''''?
  15. 工具类?
  16. '''??
  17. class?Tools():??
  18. ????#log函数??
  19. [email protected]???
  20. ????def?writelog(level,info,notify=False):??
  21. ????????if?DEBUG?==?0:??
  22. ????????????try:??
  23. ????????????????print?"["+level+"]"+info.decode('UTF-8').encode('GBK')???
  24. ????????????except:??
  25. ????????????????print?"["+level+"]"+info.encode('GBK')???
  26. ????????else:??
  27. ????????????print?"["+level+"]"+info??
  28. ????????#if?notify:??
  29. ????????#????print?"[notify]报告管理员!!"??
  30. ??????????
  31. ????#转unicode??
  32. [email protected]???
  33. ????def?toUnicode(s,charset):??
  34. ????????if(?charset?==?""?):??
  35. ????????????return?s??
  36. ????????else:??
  37. ????????????try:??
  38. ????????????????u?=?unicode(?s,?charset?)??
  39. ????????????except:??
  40. ????????????????u?=?""??
  41. ????????return?u???
  42. ??????
  43. ????#正则抓取??
  44. ????#@param?single?是否只抓取一个??
  45. [email protected]???
  46. ????def?getFromPatten(patten,src,single=False):??
  47. ????????rst?=?"";??
  48. ????????p?=?re.compile(patten,re.S)??
  49. ????????all?=?p.findall(src)??
  50. ????????for?matcher?in?all:??
  51. ????????????rst?+=?matcher?+?"?"??
  52. ????????????if(?single?):??
  53. ????????????????break??
  54. ????????return?rst.strip()??
  55. ??
  56. '''''?
  57. 网页内容爬虫?
  58. '''??
  59. class?PageGripper():??
  60. ????URL_OPEN_TIMEOUT?=?10?#网页超时时间??
  61. ????MAX_RETRY?=?3?#最大重试次数??
  62. ??????
  63. ????def?__init__(self):??
  64. ????????socket.setdefaulttimeout(self.URL_OPEN_TIMEOUT)??
  65. ??????
  66. ????#获取字符集??
  67. ????def?getCharset(self,s):??
  68. ????????rst?=?Tools.getFromPatten(u'charset=(.*?)"',s,True)??
  69. ????????if?rst?!=?"":??
  70. ????????????if?rst?==?"utf8":??
  71. ????????????????rst?=?"utf-8"??
  72. ????????return?rst??
  73. ??????
  74. ????#尝试获取页面??
  75. ????def?downloadUrl(self,url):??
  76. ????????charset?=?""??
  77. ????????page?=?""??
  78. ????????retry?=?0??
  79. ????????while?True:??
  80. ????????????try:??
  81. ????????????????fp?=?urllib2.urlopen(url)??
  82. ????????????????break??
  83. ????????????except?urllib2.HTTPError,e:?#状态错误??
  84. ????????????????Tools.writelog('error','HTTP状态错误?code='+e.code)??
  85. ????????????????raise?urllib2.HTTPError??
  86. ????????????except?urllib2.URLError,e:?#网络错误超时??
  87. ????????????????Tools.writelog('warn','页面访问超时,重试..')??
  88. ????????????????retry+=1??
  89. ????????????????if(?retry?>?self.MAX_RETRY?):??
  90. ????????????????????Tools.writelog('warn','超过最大重试次数,放弃')??
  91. ????????????????????raise?urllib2.URLError??
  92. ??????
  93. ????????while?True:??
  94. ????????????line?=?fp.readline()??
  95. ????????????if?charset?==?"":??
  96. ????????????????charset?=?self.getCharset(line)??
  97. ????????????if?not?line:??
  98. ????????????????break??
  99. ????????????page?+=?Tools.toUnicode(line,charset)??
  100. ????????fp.close()??
  101. ????????return?page??
  102. ??????
  103. ????#获取页面??
  104. ????def?getPageInfo(self,url):??
  105. ????????Tools.writelog(?"info","开始抓取网页,url=?"+url)??
  106. ????????info?=?""??
  107. ????????try:??
  108. ????????????info?=?self.downloadUrl(url)??
  109. ????????except:??
  110. ????????????raise??????????
  111. ????????Tools.writelog("debug","网页抓取成功")??
  112. ????????return?info??
  113. ??
  114. '''''?
  115. 内容提取类?
  116. '''??
  117. class?InfoGripper():??
  118. ????pageGripper?=?PageGripper()??
  119. ??????
  120. ????def?__init__(self):??
  121. ????????Tools.writelog('debug',"爬虫启动")??
  122. ????
  123. ????#抓取标题??
  124. ????def?griptitle(self,data):??
  125. ????????title?=?Tools.getFromPatten(u'box2t?sp"><h3>(.*?)</h3>',?data,?True)??
  126. ????????if?title?==?"":??
  127. ????????????title?=?Tools.getFromPatten(u'<title>(.*?)[-<]',data,True)??
  128. ????????return?title.strip()??
  129. ??????
  130. ????#抓取频道??
  131. ????def?gripchannel(self,data):??
  132. ????????zone?=?Tools.getFromPatten(u'频道:(.*?)</span>',data,True)??
  133. ????????channel?=?Tools.getFromPatten(u'<a.*?>(.*?)</a>',zone,True)??
  134. ????????return?channel??
  135. ??????
  136. ????#抓取标签??
  137. ????def?griptag(self,data):??
  138. ????????zone?=?Tools.getFromPatten(u'标签:(.*?)</[^a].*>',data,True);??
  139. ????????rst?=?Tools.getFromPatten(u'>(.*?)</a>',zone,False);??
  140. ????????return?rst??
  141. ??????
  142. ????#抓取观看次数??
  143. ????def?gripviews(self,data):??
  144. ????????rst?=?Tools.getFromPatten(u'已经有<em?class="hot"?id="viewcount">(.*?)</em>次观看',data);??
  145. ????????return?rst??
  146. ??????
  147. ????#抓取发布时间??
  148. ????def?griptime(self,data):??
  149. ????????rst?=?Tools.getFromPatten(u'在<em>(.*?)</em>发布',data,True)??
  150. ????????return?rst??
  151. ??????
  152. ????#抓取发布者??
  153. ????def?gripuser(self,data):??
  154. ????????rst?=?Tools.getFromPatten(u'title="点击进入(.*?)的用户空间"',data,True)??
  155. ????????return?rst??
  156. ??????
  157. ????#获取页面字符集??
  158. ????def?getPageCharset(self,data):??
  159. ????????charset?=?Tools.getFromPatten(u'charset=(.*?)"',data,True)??
  160. ??????????
  161. ????????if(?charset?==?"utf8"?):??
  162. ????????????charset?=?"utf-8"??
  163. ????????return?charset??
  164. ??????
  165. ????#获取CC相关数据??
  166. ????def?getCCData(self,data):??
  167. ??????????
  168. ????????zone?=?Tools.getFromPatten(u'SWFObject(.*?)</script>',data,True)??
  169. ??????????
  170. ????????#判断是否使用bokecc播放??
  171. ????????isFromBokeCC?=?re.match('.*bokecc.com.*',?zone)??
  172. ????????if(?not?isFromBokeCC?):??
  173. ????????????return?"",""??
  174. ??????????????
  175. ????????ccSiteId?=?Tools.getFromPatten(u'siteid=(.*?)[&,"]',zone,True)??
  176. ????????ccVid?=?Tools.getFromPatten(u'vid=(.*?)[&,"]',zone,True)??
  177. ????????return?ccSiteId,ccVid??
  178. ??????
  179. ????#获取站内vid??
  180. ????def?gripVideoId(self,data):??
  181. ????????vid?=?Tools.getFromPatten(u'var?vid?=?"(.*?)"',data,True)??
  182. ????????return?vid??
  183. ??????
  184. ????#获取点击量??
  185. ????def?gripViewsAjax(self,vid,url,basedir):??
  186. ????????host?=?Tools.getFromPatten(u'//(.*?)/',url,True)??
  187. ????????ajaxAddr?=?"//"?+?host?+?basedir?+?"/index.php/ajax/video_statistic/"?+?vid??
  188. ????????'''''?
  189. ????????try:?
  190. ????????????content?=?self.pageGripper.getPageInfo(ajaxAddr)?
  191. ????????except?Exception,e:?
  192. ????????????print?e?
  193. ????????????Tools.writelog?("error",?ajaxAddr+u"抓取失败")?
  194. ????????????return?"error"?
  195. ????????'''??
  196. ????????Tools.writelog('debug',?u"开始获取点击量,url="+ajaxAddr)??
  197. ????????while?True:??
  198. ????????????try:??
  199. ????????????????fp?=?urllib2.urlopen(ajaxAddr)??
  200. ????????????????break??
  201. ????????????except?urllib2.HTTPError,e:?#状态错误??
  202. ????????????????Tools.writelog('error','HTTP状态错误?code='+"%d"%e.code)??
  203. ????????????????return?""??
  204. ????????????except?urllib2.URLError,e:?#网络错误超时??
  205. ????????????????Tools.writelog('warn','页面访问超时,重试..')??
  206. ????????????????retry+=1??
  207. ????????????????if(?retry?>?self.MAX_RETRY?):??
  208. ????????????????????Tools.writelog('warn','超过最大重试次数,放弃')??
  209. ????????????????????return?""??
  210. ????????content?=?fp.read()??
  211. ????????fp.close()??
  212. ????????views?=?Tools.getFromPatten(u'"viewcount":(.*?),',content,True)??
  213. ????????views?=?views.replace('"','')??
  214. ????????return?views??
  215. ??????
  216. ????#从网页内容中爬取点击量???
  217. ????def?gripViewsFromData(self,data):??
  218. ????????views?=?Tools.getFromPatten(u'已经有<.*?>(.*?)<.*?>次观看',data,True)??
  219. ????????return?views??
  220. ??
  221. ????def?gripBaseDir(self,data):??
  222. ????????dir?=?Tools.getFromPatten(u"base_dir?=?'(.*?)'",data,True)??
  223. ????????return?dir??
  224. ??
  225. ????#抓取数据??
  226. ????def?gripinfo(self,url):???
  227. ??????????
  228. ????????try:??
  229. ????????????data?=?self.pageGripper.getPageInfo(url)??
  230. ????????except:??
  231. ????????????Tools.writelog?("error",?url+"?抓取失败")??
  232. ????????????raise??
  233. ??????????
  234. ????????Tools.writelog('info','开始内容匹配')??
  235. ????????rst?=?{}??
  236. ????????rst['title']?=?self.griptitle(data)??
  237. ????????rst['channel']?=?self.gripchannel(data)??
  238. ????????rst['tag']?=?self.griptag(data)??
  239. ????????rst['release']?=?self.griptime(data)??
  240. ????????rst['user']?=?self.gripuser(data)??
  241. ????????ccdata?=?self.getCCData(data)??
  242. ????????rst['ccsiteId']?=?ccdata[0]??
  243. ????????rst['ccVid']?=?ccdata[1]??
  244. ????????views?=?self.gripViewsFromData(data)??
  245. ????????if?views?==""?or?not?views:??
  246. ????????????vid?=?self.gripVideoId(data)??
  247. ????????????basedir?=?self.gripBaseDir(data)??
  248. ????????????views?=?self.gripViewsAjax(vid,url,basedir)??
  249. ????????????if(?views?==?""?):??
  250. ????????????????views?=?"error"??
  251. ????????????if(?views?==?"error"):??
  252. ????????????????Tools.writelog("error","获取观看次数失败")??
  253. ????????Tools.writelog("debug","点击量:"+views)??
  254. ????????rst['views']?=?views??
  255. ????????Tools.writelog('debug','title=%s,channel=%s,tag=%s'%(rst['title'],rst['channel'],rst['tag']))??
  256. ????????return?rst??
  257. ??
  258. '''''?
  259. 单元测试?
  260. '''??
  261. if?__name__?==?'__main__':??
  262. ????list?=?[??
  263. ????????????'//008yx.com/xbsp/index.php/video/index/3138',??
  264. ????????????'//vblog.xwhb.com/index.php/video/index/4067',??
  265. ????????????'//demo.ccvms.bokecc.com/index.php/video/index/3968',??
  266. ????????????'//vlog.cnhubei.com/wuhan/20100912_56145.html',??
  267. ????????????'//vlog.cnhubei.com/html/js/30271.html',??
  268. ????????????'//www.ddvtv.com/index.php/video/index/15',??
  269. ????????????'//boke.2500sz.com/index.php/video/index/60605',??
  270. ????????????'//video.zgkqw.com/index.php/video/index/334',??
  271. ????????????'//yule.hitmv.com/html/joke/27041.html',??
  272. ????????????'//www.ddvtv.com/index.php/video/index/11',??
  273. ????????????'//www.zgnyyy.com/index.php/video/index/700',??
  274. ????????????'//www.kdianshi.com/index.php/video/index/5330',??
  275. ????????????'//www.aoyatv.com/index.php/video/index/127',??
  276. ????????????'//v.ourracing.com/html/channel2/64.html',??
  277. ????????????'//v.zheye.net/index.php/video/index/93',??
  278. ????????????'//vblog.thmz.com/index.php/video/index/7616',??
  279. ????????????'//kdianshi.com/index.php/video/index/5330',??
  280. ????????????'//tv.seeyoueveryday.com/index.php/video/index/95146',??
  281. ????????????'//sp.zgyangzhi.com/html/ji/2.html',??
  282. ????????????'//www.xjapan.cc/index.php/video/index/146',??
  283. ????????????'//www.jojy.cn/vod/index.php/video/index/399',??
  284. ????????????'//v.cyzone.cn/index.php/video/index/99',??
  285. ????????????]??
  286. ??????
  287. ????list1?=?['//192.168.25.7:8079/vinfoant/versionasdfdf']??
  288. ??
  289. ????infoGripper?=?InfoGripper()??
  290. ????for?url?in?list:??
  291. ????????infoGripper.gripinfo(url)??
  292. ????del?infoGripper??

?

?

WEB服务及任务调度

?

  1. '''''?
  2. Created?on?2010-9-15?
  3. ?
  4. @author:?chenggong?
  5. '''??
  6. #?-*-?coding:?utf-8?-*-??
  7. import?string,cgi,time??
  8. from?os?import?curdir,sep??
  9. from?BaseHTTPServer?import?BaseHTTPRequestHandler,HTTPServer??
  10. from?InfoGripper?import?*??
  11. import?re??
  12. import?MySQLdb??
  13. import?time??
  14. import?threading??
  15. import?urllib??
  16. import?urllib2??
  17. ??
  18. PORT?=?8079??
  19. VERSION?=?0.1??
  20. DBCHARSET?=?"utf8"??
  21. PARAMS?=?[??
  22. ??????????'callback',??
  23. ??????????'sessionId',??
  24. ??????????'retry',??
  25. ??????????'retryInterval',??
  26. ??????????'dbhost',??
  27. ??????????'dbport',??
  28. ??????????'db',??
  29. ??????????'dbuser',??
  30. ??????????'dbpass',??
  31. ??????????'videoId'??
  32. ??????????]??
  33. ??
  34. DBMAP?=?['video_id',??
  35. ?????????'ccsiteid',??
  36. ?????????'ccvid',??
  37. ?????????'desc_url',??
  38. ?????????'site_id',??
  39. ?????????'title',??
  40. ?????????'post_time',??
  41. ?????????'author',??
  42. ?????????'elapse',??
  43. ?????????'channel',??
  44. ?????????'tags',??
  45. ?????????'create_time',??
  46. ?????????'check_time',??
  47. ?????????'status']??
  48. ??
  49. '''''?
  50. ERROR?CODE定义?
  51. '''??
  52. ERR_OK?=?0??
  53. ERR_PARAM?=?1??
  54. ERR_HTTP_TIMEOUT?=?5??
  55. ERR_HTTP_STATUS?=?6??
  56. ERR_DB_CONNECT_FAIL?=?8??
  57. ERR_DB_SQL_FAIL?=?9??
  58. ERR_GRIPVIEW?=?11??
  59. ERR_UNKNOW?=?12??
  60. ??
  61. '''''?
  62. 数据库适配器?
  63. '''??
  64. class?DBAdapter(object):??
  65. ??????
  66. ????def?__init__(self):??
  67. ????????self.param?=?{'ip':'',??
  68. ??????????????????????'port':0,??
  69. ??????????????????????'user':'',??
  70. ??????????????????????'pw':'',??
  71. ??????????????????????'db':''}??
  72. ????????self.connect_once?=?False??#是否连接过数据库??
  73. ??????
  74. ????'''''?
  75. ????????????创建/更新数据库连接池?
  76. ????'''??
  77. ????def?connect(self,ip,port,user,pw,db):??
  78. ????????if(?ip?!=?self.param['ip']?or??
  79. ????????????port?!=?self.param['port']?or??
  80. ????????????user?!=?self.param['user']?or??
  81. ????????????pw?!=?self.param['pw']?or??
  82. ????????????db?!=?self.param['db']):??
  83. ????????????Tools.writelog('info','更换数据库连接池,ip='+ip+',port='+port+',user='+user+',pw='+pw+',db='+db)??
  84. ????????????try:??
  85. ????????????????if?self.connect_once?==?True:?#释放上次连接??
  86. ????????????????????self.cur.close()??
  87. ????????????????????self.conn.close()??
  88. ????????????????self.conn=MySQLdb.connect(user=user,passwd=pw,db=db,host=ip,port=int(port))??
  89. ????????????????self.conn.set_character_set(DBCHARSET)??
  90. ????????????????self.connect_once?=?True??
  91. ????????????????self.cur=self.conn.cursor(MySQLdb.cursors.Cursor)??
  92. ????????????????self.param['ip']?=?ip??
  93. ????????????????self.param['port']?=?port??
  94. ????????????????self.param['user']?=?user??
  95. ????????????????self.param['pw']?=?pw??
  96. ????????????????self.param['db']?=?db??
  97. ????????????except:??
  98. ????????????????Tools.writelog('error',u'数据库连接失败',True)???
  99. ????????????????raise??
  100. ????????????else:??
  101. ????????????????Tools.writelog('info',u'数据库连接成功')??
  102. ??????
  103. ????'''''?
  104. ????????????执行SQL语句?
  105. ????'''??
  106. ????def?execute(self,sql):??
  107. ????????Tools.writelog('debug',u'执行SQL:?'+sql)??
  108. ????????try:??
  109. ????????????self.cur.execute(sql)??
  110. ????????except:??
  111. ????????????Tools.writelog('error',u'SQL执行错误:'+sql)??
  112. ????????????raise??
  113. ??????????????????
  114. ????'''''?
  115. ????????????查询数据库?
  116. ????'''??
  117. ????def?query(self,sql):??
  118. ????????row?=?{}??
  119. ????????self.execute(sql)??
  120. ????????row=self.cur.fetchall()??
  121. ????????return?row??
  122. ??????
  123. ????'''''?
  124. ????????????视频错误?
  125. ????'''??
  126. ????def?updateErr(self,videoId):??
  127. ????????nowtime?=?time.strftime('%Y-%m-%d-%H-%M-%S',time.localtime(time.time()))??
  128. ????????sql?=?"UPDATE?videos?SET?"??
  129. ????????sql?+=?"check_time='"?+?nowtime?+"',"??
  130. ????????sql?+=?"status=-1?"??
  131. ????????sql?+=?"WHERE?video_id="+videoId??
  132. ????????self.execute(sql)??
  133. ????????self.conn.commit()???
  134. ??????????
  135. ????'''''?
  136. ????????????更新查询结果?
  137. ????'''??
  138. ????def?update(self,obj,videoId,isUpdateTitle=True):??
  139. ??????????
  140. ????????Tools.writelog('debug','开始更新数据库')??
  141. ????????try:??
  142. ????????????#更新video表??
  143. ????????????sql?=?"UPDATE?videos?SET?"??
  144. ????????????if(obj['ccsiteId']?!=""?):??
  145. ????????????????sql?+=?"ccsiteid='"?+?obj['ccsiteId']?+?"',"??
  146. ????????????if(obj['ccVid']?!=?""?):??
  147. ????????????????sql?+=?"ccvid='"?+?obj['ccVid']?+?"',"??
  148. ????????????if?isUpdateTitle:??
  149. ????????????????sql?+=?"title='"?+?obj['title']?+?"',"??
  150. ????????????sql?+=?"post_time='"?+?obj['release']?+?"',"??
  151. ????????????sql?+=?"author='"?+?obj['user']?+?"',"??
  152. ????????????sql?+=?"channel='"?+?obj['channel']?+?"',"??
  153. ????????????sql?+=?"tags='"?+?obj['tag']?+?"',"??
  154. ????????????nowtime?=?time.strftime('%Y-%m-%d-%H-%M-%S',time.localtime(time.time()))??
  155. ????????????sql?+=?"check_time='"?+?nowtime?+"',"??
  156. ????????????sql?+=?"status=0?"??
  157. ????????????sql?+=?"WHERE?video_id="+videoId??
  158. ??????????????
  159. ????????????self.execute(sql)??
  160. ??????
  161. ????????????#更新count表??
  162. ????????????if(?obj['views']?!=?'error'?):??
  163. ????????????????nowdate?=?time.strftime('%Y-%m-%d',time.localtime(time.time()))??
  164. ????????????????sql?=?"SELECT?*?FROM?counts?WHERE?"??
  165. ????????????????sql?+=?"date?=?'"?+?nowdate?+?"'?and?video_id="?+?videoId??
  166. ????????????????rst?=?self.query(sql)??
  167. ????????????????if?len(rst)?>?0:#如果当天已有记录,则更新??
  168. ????????????????????sql?=?"UPDATE?counts?SET?count="+obj['views']??
  169. ????????????????????sql?+="?WHERE?video_id="?+?videoId?+?"?AND?date='"?+?nowdate+?"'"??
  170. ????????????????else:#否则插入??
  171. ????????????????????sql?=?"INSERT?INTO?counts?VALUES"??
  172. ????????????????????sql?+=?"(null,"?+videoId+",'"+nowdate+"',"+obj['views']?+?")"??
  173. ????????????self.execute(sql)?????????????????
  174. ????????????self.conn.commit()???????
  175. ????????????Tools.writelog('debug',?"db?commit?ok")??
  176. ????????????return?ERR_OK??
  177. ????????except?Exception,e:??
  178. ????????????print?e??
  179. ????????????return?ERR_DB_SQL_FAIL??
  180. ??
  181. '''''?
  182. 任务线程类?
  183. '''??
  184. class?TaskThread(threading.Thread):??
  185. ??????
  186. ????def?setTaskTool(self,dbAdapter,gripper):??
  187. ????????self.dbAdapter?=?dbAdapter??
  188. ????????self.gripper?=?gripper??
  189. ??????
  190. ????def?setParam(self,param):??
  191. ????????self.param?=?param??
  192. ????????self.videoId?=?param['videoId']??
  193. ????????assert?self.videoId?!=?""??
  194. ??????
  195. ????def?init(self):??
  196. ????????self.views?=?"0"??
  197. ????????self.errcode?=?ERR_OK??
  198. ??????
  199. ????def?run(self):??
  200. ????????Tools.writelog('debug','开始爬虫任务,sessionId='+self.param['sessionId'])??
  201. ????????self.init()??
  202. ????????try:??
  203. ????????????#更新数据库连接??
  204. ????????????self.dbAdapter.connect(self.param['dbhost'],self.param['dbport'],self.param['dbuser'],self.param['dbpass'],self.param['db'])??
  205. ????????except:??
  206. ????????????self.errcode?=?ERR_DB_CONNECT_FAIL?#数据库连接失败??
  207. ????????????callback(self.errcode)??
  208. ????????????return??
  209. ??????????
  210. ????????#查询该vid的视频??
  211. ????????sql?=?"SELECT?"??
  212. ????????for?column?in?DBMAP:??
  213. ????????????sql?+=?column??
  214. ????????????if?column?!=?DBMAP[len(DBMAP)-1]:??
  215. ????????????????sql?+=?","??
  216. ??
  217. ????????sql?+=?"?FROM?videos"??
  218. ????????sql?+=?"?WHERE?video_id="+self.videoId??
  219. ????????video?=?self.dbAdapter.query(sql)??
  220. ????????assert?not?(len(video)>1?or?len(video)==0)?#有且仅有一条记录??
  221. ??????????
  222. ????????url?=?video[0][3]??
  223. ????????assert?url?!=?""???
  224. ????????try:??
  225. ????????????rst?=?self.gripper.gripinfo(url)??
  226. ????????except?urllib2.HTTPError,e:??
  227. ????????????self.errcode?=?ERR_HTTP_STATUS?#HTTP状态错误??
  228. ????????????self.dbAdapter.updateErr(self.videoId)??
  229. ????????except?urllib2.URLError,e:??
  230. ????????????self.errcode?=?ERR_HTTP_TIMEOUT?#HTTP连接超时??
  231. ????????????self.dbAdapter.updateErr(self.videoId)??
  232. ????????except:??
  233. ????????????self.errcode?=?ERR_UNKNOW?#未知错误??
  234. ????????????self.dbAdapter.updateErr(self.videoId)??
  235. ????????else:??
  236. ????????????self.views?=?rst['views']??
  237. ????????????if?self.views?==?"error":??
  238. ????????????????self.views?=?"-1"??
  239. ????????????????self.errcode?=?ERR_GRIPVIEW?#数据抓取成功,点击量抓取失败??
  240. ????????????#更新数据库(特殊处理,如果原title中有?"-"?则不更新title字段)??
  241. ????????????title?=?video[0][5]??
  242. ????????????assert?title?!=?""??
  243. ????????????if?re.match('.*-.*',?title):??
  244. ????????????????self.errocde?=?self.dbAdapter.update(rst,self.videoId,True)??
  245. ????????????else:??
  246. ????????????????self.errcode?=?self.dbAdapter.update(rst,self.videoId)??
  247. ????????self.callback(self.errcode)??
  248. ????????Tools.writelog('info','任务结束,sessionId='+self.param['sessionId'])??
  249. ????????return??
  250. ??????
  251. ????def?callback(self,errcode):?????
  252. ????????results?=?{'errorcode':errcode,'count':int(self.views)}??
  253. ????????results?=?urllib.urlencode(results)??
  254. ????????results?=?results.replace('&',?'%26')??
  255. ????????url?=?self.param['callback']??
  256. ????????url?+=?"?"??
  257. ????????url?+=?"sessionId="?+?self.param['sessionId']??
  258. ????????url?+=?"&results="?+?results??
  259. ????????retry?=?0??
  260. ????????while?True:??
  261. ????????????try:??
  262. ????????????????Tools.writelog('debug',"回调主控,url="+url)??
  263. ????????????????urllib2.urlopen(url)??
  264. ????????????????Tools.writelog('debug','回调成功')??
  265. ????????????????break??
  266. ????????????except?urllib2.URLError,?e:?#超时、错误??
  267. ????????????????Tools.writelog('debug','回调主控超时,%s秒后重试'%self.param['retryInterval'])??
  268. ????????????????retry+=1??
  269. ????????????????time.sleep(int(self.param['retryInterval']))??
  270. ????????????????if(?retry?>?int(self.param['retry'])):??
  271. ????????????????????Tools.writelog('error','回调主控失败')??
  272. ????????????????????return???
  273. ??
  274. '''''?
  275. WEB服务类?
  276. '''??
  277. class?MyHandler(BaseHTTPRequestHandler):??
  278. ??????
  279. ????dbAdapter?=?DBAdapter()??
  280. ????gripper?=?InfoGripper()??
  281. ??????
  282. ????def?pageSuccess(self):??
  283. ????????self.send_response(200)??
  284. ????????self.send_header('Content-type',?'text/html')??
  285. ????????self.end_headers()??
  286. ??????????
  287. ????def?pageFail(self):??
  288. ????????self.send_error(404,?"not?found")??
  289. ??????
  290. ????def?getValue(self,param):??
  291. ????????src?=?self.path?+?'&'??
  292. ????????reg?=?param?+?'='?+?'(.*?)&'??
  293. ??????????
  294. ????????value?=?Tools.getFromPatten(reg,src,True)??
  295. ????????return?value??
  296. ??????
  297. ????def?do_GET(self):??
  298. ????????isGetVersion?=?re.match('.*vinfoant/version.*',?self.path)??
  299. ????????isTask?=?re.match('.*vinfoant/run.*',?self.path)??
  300. ????????if(?isGetVersion?):??
  301. ????????????self.pageSuccess()??
  302. ????????????self.wfile.write(VERSION)??
  303. ????????elif(?isTask?):??
  304. ????????????self.pageSuccess()??
  305. ????????????param?=?{}??
  306. ????????????for?p?in?PARAMS:??
  307. ????????????????param[p]?=?self.getValue(p)?#获取各项参数??
  308. ????????????taskThread?=?TaskThread()??
  309. ????????????taskThread.setTaskTool(self.dbAdapter,?self.gripper)??
  310. ????????????taskThread.setParam(param)??
  311. ????????????taskThread.start()#启动任务线程??
  312. ????????????self.wfile.write("ok")??????????
  313. ????????else:??
  314. ????????????self.pageFail()??
  315. ????????return??
  316. ??????
  317. '''''?
  318. 启动WEB服务,全局入口?
  319. '''??
  320. def?startHttpd():??
  321. ????try:??
  322. ????????Tools.writelog('debug','httpd?start..listen?on?'+str(PORT))??
  323. ????????httpd?=?HTTPServer(('',PORT),?MyHandler?)??
  324. ????????Tools.writelog('debug','success')??
  325. ????????httpd.serve_forever()??
  326. ????except?KeyboardInterrupt:??
  327. ????????Tools.writelog('debug','httpd?close..')??
  328. ????????httpd.socket.close()??
  329. ??????????
  330. if?__name__?==?'__main__':??
  331. ????startHttpd()?

【上篇】
【下篇】

抱歉!评论已关闭.

  • 一以贯之推进党的建设新的伟大工程 2019-03-19
  • 回复@真理论者:你天天在强坛攻击爱因斯坦是不是劳动?创造价值么?负价值也! 2019-03-19
  • 北京天安门广场更换花卉 2019-03-18
  • 党的自我革命是伟大社会革命的强大动力(深入学习贯彻习近平新时代中国特色社会主义思想) 2019-03-18
  • 人民日报人民时评:让安全生产理念成为基本共识 2019-03-18
  • “人民体育 健康中国”马拉松系列赛北京站 2019-03-17
  • 识破“假大学”并没那么难 2019-03-17
  • 佛山:用公积金买装配式住房 贷款额度或可上浮20% ——凤凰网房产北京 2019-03-17
  • 奥运冠军寄语Running Together国际迷你马拉松—在线播放—《奥运冠军寄语Running Together国际迷你马拉松》—体育—优酷网,视频高清在线观看 2019-03-17
  • 【理上网来喜迎十九大】西班牙学者:大国外交令中国成为建立世界新秩序的中流砥柱 2019-03-16
  • 马上背!十九大报告中的四个“新” 2019-03-16
  • 呼市赛罕区南门外小学开展庆父亲节亲子趣味足球赛 2019-03-16
  • 2017大皖客户端徽派栏目全面回顾宣传片 2019-03-15
  • 回复@海之宁:你想自主劳动?全民所有的生产资料凭啥让你自主? 2019-03-15
  • 重庆市南岸区:探索建立“微益坊” 2019-03-15
  • 765| 679| 419| 743| 507| 689| 321| 744| 283| 130|