login每天学习一点点,每天进步一点点.
当前位置:首页 >> pd87博客进行python爬虫测试 Ver:1.1

pd87博客进行python爬虫测试 Ver:1.1

2015-12-29 22:55:09  |  分类: Python |  标签: 阅读(296)评论(0)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : c32 (amd5@qq.com)
# @Blog    : http://cx7863.blog.163.com/
# @Version : 1.1
# @DateTime:  2015-12-29 22:53:00

import urllib2
import re
import os
import MySQLdb
import threading 
import time

# Accumulators for crawl statistics (populated elsewhere / reserved).
htmlPageList = []
# pages paginated through so far
htmlLinkList = []
# number of article URLs scraped so far
mysqlinsert = []
# rows inserted into MySQL so far

# Wall-clock start time, used at the end of the script to report total runtime.
starttime = time.time()

# Module-level MySQL connection shared by the whole script.
# NOTE(review): hard-coded root/root credentials — fine for a local demo,
# move to config before any real use.
conn = MySQLdb.connect(host='localhost', user='root',
                    passwd='root', db='python', port = 3306, charset = 'utf8')

def getHtml(url):   # fetch raw HTML source
    """Download *url* and return the raw response body as a byte string.

    Fix: the original never closed the urllib2 response, leaking the
    underlying socket on every call; close it in a finally block.
    """
    page = urllib2.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()
 
def urlPages(page):     # build the paginated listing URL
    """Return the listing URL for page number *page*."""
    base = 'http://bk.pd87.com/page/'
    return base + str(page)
def findList(html):     # regex-match the article list
    """Extract every (url, title_attr, link_text) tuple from a listing page."""
    pattern = re.compile('<h2><a href="(.*?)" rel="bookmark" title="详细阅读 (.*?)">(.*?)</a><span class="new"></span></h2>', re.S)
    return pattern.findall(html)
 
for page in range(1, 2+1):    #抓取的页数
    html = getHtml(urlPages(page))
    items = findList(html)
    for item in items:
        s = item[0] +' '+ item[1] + '\n'
        # print item[0]
        file_object = open('list.txt', 'a')
        # file_object.write(s)    #写到本地文本
        file_object.close()
        print item[1]
        # print html              #读出源代码
        # print item              #匹配正则后的结果
    else:
        print('循环爬取结束>>>>>>>>>>>>>>>')
    cur = conn.cursor()
    cur.execute('insert into list(url,title) values(%s,%s)',(item[0],item[1]))
    conn.commit()  #提交SQL执行语句
# print 'success connect'
# print 'sqltemp'
class MyThread(threading.Thread):
    """Named worker-thread stub; only forwards the name to the Thread base."""

    def __init__(self, threadname):
        super(MyThread, self).__init__(name=threadname)


# Tear down DB resources opened above and report total runtime.
cur.close()    # release the cursor used by the crawl loop
conn.close()   # close the module-level MySQL connection
endtime = time.time()
# "took %s seconds in total" (Python 2 print statement)
print "共用时%s秒" % (endtime - starttime)
上一篇:ubuntu使用nmap查询端口 下一篇:ThinkPHP5 的视图$view->fetch()和$view->display()的区别

猜你喜欢

发表评论:

0.122866s