The web novel 《侯大利刑侦笔记》 is quite gripping, but reading it in the WeChat Reading app on a phone never felt like enough. I found a novel forum where I could read it smoothly, yet clicking through the forum page by page was tedious, and I also had to put up with pointless ad banners. Could a Python crawler download all the novel pages in one go?

With that idea in mind, I got to work. Starting from a crawler script shared by another user, I modified it until it could pull down every novel page on the forum in one pass. The chapter files are then merged into a single txt file and loaded onto a Kindle, which is easier on the eyes than a computer screen.

The code is as follows:

# coding=utf-8
import glob
import re
import urllib.error
import urllib.request
from urllib.parse import urljoin


def cleanhtml(raw_html):
    """Strip all HTML tags from a string, leaving plain text."""
    cleanr = re.compile('<.*?>')
    return re.sub(cleanr, '', raw_html)


def fetch_section(webroot):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                             'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    for page in range(8, 9):
        print('Downloading novel page ' + str(page))

        url = 'http://www.cangshubao.net/forum-915-' + str(page) + '.html'
        try:
            request = urllib.request.Request(url, headers=headers)
            response = urllib.request.urlopen(request, timeout=180)
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            continue  # skip this page if the request failed

        # The forum pages are encoded in GBK.
        html = response.read().decode('gbk')

        # Each chapter link on the listing page looks like:
        # <span id="..."><a href="thread-xxx.html">第205章...</a></span>
        pattern = re.compile(u'<span id="(.*?)"><a href="(.*?)">第(.*?)</a></span>')
        items = re.findall(pattern, html)

        for item in items:
            try:
                book_link = item[1]
                book_name = item[2]
                book_full_link = urljoin(webroot, book_link)  # absolute URL of the chapter

                # Fetch the chapter page.
                try:
                    request = urllib.request.Request(book_full_link, headers=headers)
                    response = urllib.request.urlopen(request, timeout=180)
                except urllib.error.URLError as e:
                    if hasattr(e, "code"):
                        print(e.code)
                    if hasattr(e, "reason"):
                        print(e.reason)
                    continue  # skip this chapter if the request failed
                html = response.read().decode('gbk')

                # The chapter body sits inside a one-cell table.
                pattern = re.compile(
                    '<table cellspacing="0" cellpadding="0"><tr><td class="t_msgfont" '
                    'id="(.*?)"><font size="5">(.*?)</font></td></tr></table>', re.S)
                section_content = re.findall(pattern, html)

                # Write the chapter to its own txt file, kept in GBK so the
                # merge step produces a single-encoding result. Some titles
                # still carry HTML, so fall back to extracting the plain title.
                try:
                    fp = open(book_name + '.txt', 'w', encoding='gbk')
                except IOError:
                    pattern = re.compile('<font size="5">(.*?)</font><br />', re.S)
                    book_name = re.findall(pattern, book_name)
                    fp = open(book_name[0] + '.txt', 'w', encoding='gbk')
                print('start download')
                cleanTxt = cleanhtml(str(section_content[0][1]))
                fp.write(cleanTxt)
                print('download finished\n')
                fp.close()
            except Exception as e:
                print('Failed to parse this entry, skipping it')
                print(e)
                print('')
                with open('error.log', 'a') as log:
                    log.write('page:' + str(page) + '\n')
                    log.write(str(item) + '\n')
                    log.write('There is an error in parsing process.\n\n')


def merge_txt_files():
    read_files = glob.glob("*.txt")
    with open("result.txt", "wb") as outfile:
        for f in read_files:
            if f == "result.txt":
                continue  # never merge the output file into itself
            with open(f, "rb") as infile:
                print(f)
                outfile.write(f.encode('gbk'))  # chapter title as a header line
                outfile.write(infile.read())


def main():
    webroot = 'http://www.cangshubao.net/'
    # fetch_section(webroot)
    merge_txt_files()


if __name__ == '__main__':
    main()
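
One caveat about merge_txt_files(): glob() returns filenames in arbitrary filesystem order, so the chapters can end up shuffled inside result.txt. Below is a minimal sketch of an order-preserving merge; it assumes each chapter file starts with its chapter number (e.g. "205章….txt"), which is what fetch_section() produces for this forum.

import glob
import re

def merge_txt_files_sorted():
    def chapter_no(name):
        # Pull the leading chapter number out of the filename;
        # files without one sort to the front.
        m = re.match(r'(\d+)', name)
        return int(m.group(1)) if m else 0

    read_files = sorted(glob.glob("*.txt"), key=chapter_no)
    with open("result.txt", "wb") as outfile:
        for f in read_files:
            if f == "result.txt":
                continue  # never merge the output file into itself
            with open(f, "rb") as infile:
                outfile.write(f.encode('gbk'))  # chapter title as a header line
                outfile.write(infile.read())

Swapping this in for merge_txt_files() in main() keeps the chapters in reading order before the file is copied onto the Kindle.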