分类 Python脚本 下的文章

Python小爬虫

用Python写的小爬虫,用来爬取Amazon上的书籍信息。它分为2个脚本:一个负责把页面下载下来,另一个负责分析已下载的页面。下载时可以同时运行多个脚本进程(通过数据库中的状态字段进行协调)。

创建表语句

-- Book table used by the crawler: one row per book, with a unique key on ASIN.
-- The DOWNLOAD_SUCCESS / IS_DOWNLOADING / PARSE_OK columns hold per-book
-- crawl state (their meaning is given in the COMMENT clauses below).
-- NOTE(review): AUTO_INCREMENT=110297 looks like it was carried over from a
--   dump of an already-populated table — confirm it is wanted on a fresh table.
-- NOTE(review): PARSE_OK defaults to NULL while the other two state flags
--   default to 0 — confirm this difference is intentional.
CREATE TABLE `AMAZON_BOOK` (
  `BOOK_ID` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '书ID',
  `ASIN` varchar(100) NOT NULL COMMENT 'amazon书籍标识',
  `BOOK_NAME` varchar(200) DEFAULT NULL COMMENT '书名',
  `BOOK_SERIES` varchar(100) DEFAULT NULL COMMENT '丛书名',
  `ORIGINAL_BOOK_NAME` varchar(200) DEFAULT NULL COMMENT '外文书名',
  `AUTHOR` varchar(200) DEFAULT NULL COMMENT '作者',
  `EDITOR` varchar(200) DEFAULT NULL COMMENT '责任编辑',
  `EDITOR_CONTACT` varchar(400) DEFAULT NULL COMMENT '责任编辑联系方式',
  `PUBLISHER_NAME` varchar(200) DEFAULT NULL COMMENT '出版社名称',
  `PUBLISH_DATE` date DEFAULT NULL COMMENT '出版时间',
  `PUBLISH_VERSION` varchar(100) DEFAULT NULL COMMENT '版次',
  `PRINTED_COUNT` int(11) DEFAULT '0' COMMENT '印次',
  `PRINTED_DATE` date DEFAULT NULL COMMENT '印刷时间',
  `ISBN` varchar(100) DEFAULT NULL COMMENT 'ISBN',
  `BARCODE` varchar(100) DEFAULT NULL COMMENT '条形码',
  `WORD_COUNT` varchar(50) DEFAULT NULL COMMENT '字数',
  `FACT_PAGE_COUNT` int(11) DEFAULT NULL COMMENT '实际页数',
  `PAGE_COUNT` int(11) DEFAULT NULL COMMENT '电子书页数',
  `CHAPTER_COUNT` int(11) DEFAULT NULL COMMENT '章节数量',
  `PRINTED_QUANTITY` int(11) DEFAULT NULL COMMENT '印刷数量',
  `FOLIO` varchar(50) DEFAULT NULL COMMENT '开本',
  `PAPER_MATERIAL` varchar(50) DEFAULT NULL COMMENT '纸张',
  `PACK` varchar(50) DEFAULT NULL COMMENT '包装',
  `INTRODUCTION` mediumtext COMMENT '简介',
  `AUTHOR_INTRODUCTION` mediumtext COMMENT '作者简介',
  `EDITOR_COMMENT` mediumtext COMMENT '编辑评论',
  `CELEBRITY_COMMENT` mediumtext COMMENT '名人评论',
  `TABLE_OF_CONTENTS` mediumtext COMMENT '目录',
  `TAGS` varchar(200) DEFAULT NULL COMMENT '标签',
  `BOOK_CATEGORY_CODE` varchar(35) DEFAULT NULL COMMENT '书分类代码',
  `PAPER_PRICE` decimal(13,4) DEFAULT NULL COMMENT '纸质书价格',
  `LANGUAGE` varchar(200) DEFAULT NULL COMMENT '语种',
  `PACKAGE_SIZE` varchar(200) DEFAULT NULL COMMENT '商品尺寸',
  `PACKAGE_WEIGHT` varchar(200) DEFAULT NULL COMMENT '商品重量',
  `TRANSLATOR` varchar(200) DEFAULT NULL COMMENT '译者',
  `EDITOR_DEPARTMENT` varchar(200) DEFAULT NULL COMMENT '责编部门',
  `CREATE_DATETIME` datetime DEFAULT NULL,
  `CREATE_BY` varchar(100) DEFAULT NULL,
  `UPDATE_DATETIME` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  `UPDATE_BY` varchar(100) DEFAULT NULL,
  `DELETE_FLAG` tinyint(4) DEFAULT '0' COMMENT '0未删除,1已删除',
  `PAPER_SALE_PRICE` decimal(13,4) DEFAULT NULL COMMENT '纸质书售价',
  `DOWNLOAD_SUCCESS` tinyint(1) DEFAULT '0' COMMENT '0下载未成功,1下载成功',
  `IS_DOWNLOADING` tinyint(1) DEFAULT '0' COMMENT '0没有在下载中,1已在下载中',
  `PARSE_OK` tinyint(1) DEFAULT NULL COMMENT '0未分析成功,1成功分析',
  PRIMARY KEY (`BOOK_ID`),
  UNIQUE KEY `ASIN` (`ASIN`)
) ENGINE=InnoDB AUTO_INCREMENT=110297 DEFAULT CHARSET=utf8 COMMENT='书';

请自行在数据库中添加 ASIN 字段的值(可以通过分析搜索结果页面来抓取所需的 ASIN)。

下载页面脚本

#!/usr/bin/env python
# coding=utf-8

import os
import time
import socket
import urllib
import MySQLdb


def reportHook(blocks_read, block_size, total_size):
    """Print download progress; used as the ``reporthook`` of ``urllib.urlretrieve``.

    Args:
        blocks_read: number of blocks transferred so far; a falsy value (0)
            means nothing has been read yet and only the "opened" line is printed.
        block_size: size of one block in bytes.
        total_size: total size in bytes, or a negative value when unknown.
    """
    if not blocks_read:
        print('Connection opened')
        return

    amount_read = blocks_read * block_size
    if total_size < 0:
        print('Read %d blocks (%d bytes)' % (blocks_read, amount_read))
    else:
        # blocks_read * block_size counts whole blocks, so once the final
        # (partial) block arrives it overshoots the real size; clamp it so
        # the progress line never reads e.g. "24576/20000".
        print('Read %d blocks, or %d/%d'
              % (blocks_read, min(amount_read, total_size), total_size))


def downloadPage(url, filename):
    """Fetch *url* and store the response in *filename*, printing progress via reportHook."""
    # The third positional argument of urlretrieve is the progress callback.
    urllib.urlretrieve(url, filename, reportHook)


def createDir(dir_name):
    """Create the directory *dir_name* (one per book) unless it already exists.

    Raises:
        OSError: if the directory cannot be created and does not already exist
            as a directory (e.g. a regular file is in the way).
    """
    # Try first and inspect the failure afterwards: with several downloader
    # processes running, another process may create the directory between an
    # "exists?" check and the mkdir call.
    try:
        os.mkdir(dir_name)
    except OSError:
        if not os.path.isdir(dir_name):
            raise


def existsFile(file_name, min_size=102400):
    """Return True if *file_name* exists and is larger than *min_size* bytes.

    Args:
        file_name: path of the downloaded file.
        min_size: size in bytes the file must exceed to count as a complete
            download. Defaults to 102400 (100 KiB), the threshold this script
            has always applied.

    Returns:
        bool: False when the file is missing, unreadable, or not larger than
        *min_size*.
    """
    # Ask for the size directly rather than testing existence first, so a file
    # that disappears between the two calls yields False instead of raising.
    try:
        return os.path.getsize(file_name) > min_size
    except OSError:
        return False


def fetchAllResult(cursor, sql, params=None):
    """Execute a query and return every result row.

    Args:
        cursor: DB-API cursor to run the query on.
        sql: SQL text; may contain the driver's parameter placeholders.
        params: optional sequence/mapping of values to bind to the
            placeholders. Binding values here, rather than formatting them
            into *sql*, lets the driver do the quoting.

    Returns:
        Whatever ``cursor.fetchall()`` returns for the executed statement.
    """
    # Only pass params when given: not every driver accepts an explicit None.
    if params is None:
        cursor.execute(sql)
    else:
        cursor.execute(sql, params)
    return cursor.fetchall()


def updateSQL(cursor, sql, params=None):
    """Execute a data-modifying statement (the caller is responsible for committing).

    Args:
        cursor: DB-API cursor to run the statement on.
        sql: SQL text; may contain the driver's parameter placeholders.
        params: optional sequence/mapping of values to bind to the
            placeholders, so values need not be formatted into *sql*.

    Returns:
        ``cursor.rowcount`` after the statement has run.
    """
    # Only pass params when given: not every driver accepts an explicit None.
    if params is None:
        cursor.execute(sql)
    else:
        cursor.execute(sql, params)
    return cursor.rowcount


def startDownLoad(conn, start_asni, error_file):
    """Download the product page and the product-description page of one book.

    The book is first flagged as "downloading". Both pages are then saved in
    the current working directory as ``<ASIN>.html`` and
    ``<ASIN>_description.html``. If both files pass ``existsFile`` the book is
    flagged as successfully downloaded; otherwise its flags are reset so it can
    be picked up again.

    Args:
        conn: open database connection; used to create a cursor and to commit.
        start_asni: ASIN of the book to download.
        error_file: writable file object; the ASIN is appended to it when the
            download raises an exception.
    """
    print('')
    print("-------start download: < %s > --------" % start_asni)

    # Work on a cursor obtained from the connection that was passed in, so the
    # function does not depend on a module-level ``cur`` happening to exist.
    cursor = conn.cursor()
    try:
        # NOTE(review): the ASIN is formatted straight into the SQL text
        # because the module-level templates wrap the placeholder in quotes;
        # this presumes ASINs never contain quote characters — confirm.
        updateSQL(cursor, set_downloading_sql % start_asni)
        conn.commit()

        product_file_name = start_asni + '.html'
        product_description_file_name = start_asni + '_description.html'
        try:
            downloadPage(product_url % start_asni, product_file_name)
            downloadPage(product_description % start_asni, product_description_file_name)
            if existsFile(product_file_name) and existsFile(product_description_file_name):
                updateSQL(cursor, set_success_sql % start_asni)
                conn.commit()
                print("..... Book [%s] download OK..... " % start_asni)
            else:
                # NOTE(review): the flags are reset without any delay or retry
                # limit, so a page that is always below the size threshold will
                # be selected again straight away — consider a retry cap.
                print("..... Book [%s] download failure, will restart download ..... " % start_asni)
                updateSQL(cursor, set_failure_sql % start_asni)
                conn.commit()
        except Exception:
            print("******* [ %s ] download exception *******" % start_asni)
            print('')
            updateSQL(cursor, set_failure_sql % start_asni)
            conn.commit()
            error_file.write(start_asni + '\n')
            # Flush so the entry is on disk even if the process dies later.
            error_file.flush()
            time.sleep(2)
        except BaseException:
            # Ctrl-C / interpreter exit: release the "downloading" flag so the
            # book is not left claimed forever, then let the interruption
            # propagate instead of silently continuing with the next book.
            updateSQL(cursor, set_failure_sql % start_asni)
            conn.commit()
            raise
    finally:
        cursor.close()


def connMysql(host, user, passwd, db, port):
    """Open a MySQL connection using the utf8 character set.

    Args:
        host: server host name or IP.
        user: user name.
        passwd: password.
        db: name of the database to use.
        port: TCP port of the server.

    Returns:
        The connection object, or None if connecting raised ``MySQLdb.Error``
        (the error is printed).
    """
    try:
        return MySQLdb.connect(host=host, user=user, passwd=passwd, db=db,
                               port=port, charset='utf8')
    except MySQLdb.Error as e:
        # Not every error carries a (code, message) pair; indexing args[1]
        # unconditionally would raise from inside this handler and hide the
        # real connection problem.
        if len(e.args) >= 2:
            print("Mysql Error [ %s ]: %s" % (e.args[0], e.args[1]))
        else:
            print("Mysql Error: %s" % (e,))
        return None



# Database host, user name, password, schema name and port.
# NOTE(review): credentials are hard-coded here — consider reading them from
# the environment or a config file before sharing or deploying this script.
host = 'localhost'
user = 'root'
passwd = '123456'
db = 'download_book'
port = 3306
# Root directory for downloaded pages; the main loop chdir()s into it.
book_root = "D:\\AMAZON_BOOK"

# Default timeout, in seconds, for sockets created from now on.
socket.setdefaulttimeout(20)

# Product page URL; %s is replaced by the ASIN.
# NOTE(review): the path segment before /dp/ looks like an arbitrary
# placeholder — confirm.
product_url = 'http://www.amazon.cn/111fsfsdfd/dp/%s'

# Product description (detail) page URL; %s is replaced by the ASIN.
product_description = 'http://www.amazon.cn/111fsfsdfd/dp/product-description/%s'


# Query: pick one book that is neither downloaded nor currently being downloaded.
get_notstart_sql = """
    SELECT
        BOOK_ID,ASIN,DOWNLOAD_SUCCESS,IS_DOWNLOADING
    FROM
        AMAZON_BOOK
    WHERE
        DOWNLOAD_SUCCESS = 0 AND IS_DOWNLOADING = 0
    LIMIT 1
    """

# Update: flag the book with the given ASIN as being downloaded.
# The %s placeholder is filled in with Python string formatting.
set_downloading_sql = """
    UPDATE
        AMAZON_BOOK
    SET
        IS_DOWNLOADING = 1
    WHERE
        ASIN = '%s'
    """

# Update run after a successful download.
# Only DOWNLOAD_SUCCESS is set; IS_DOWNLOADING is left unchanged.
set_success_sql = """
    UPDATE
        AMAZON_BOOK
    SET
        DOWNLOAD_SUCCESS = 1
    WHERE
        ASIN = '%s'
    """

# Update run after a failed download: reset both flags so the book matches
# get_notstart_sql again.
set_failure_sql = """
    UPDATE
        AMAZON_BOOK
    SET
        DOWNLOAD_SUCCESS = 0,IS_DOWNLOADING = 0
    WHERE
        ASIN = '%s'
    """


# Main loop: connect, then keep picking one not-yet-downloaded book at a time
# and download it until the query returns no more rows.
conn = connMysql(host, user, passwd, db, port)
if conn:
    # `cur` is deliberately a module-level name: code elsewhere in this script
    # may read it as a global.
    cur = conn.cursor()
    errorlog = open('d:\\error.log', 'a')
    try:
        while True:
            # Every book is stored in its own sub-directory, so go back to the
            # root before handling the next one.
            os.chdir(book_root)
            start_asni = fetchAllResult(cur, get_notstart_sql)
            # Presumably this commit ends the read transaction so the next
            # SELECT sees flags changed by other downloader processes —
            # TODO confirm against the configured isolation level.
            conn.commit()

            # An empty result means there is nothing left to download.
            if not start_asni:
                print('')
                print("-------- There is no Book page downdload ! ---------")
                break

            # Column 1 of the selected row is the ASIN.
            asni = start_asni[0][1]
            createDir(asni)
            os.chdir(asni)
            startDownLoad(conn, asni, errorlog)
    finally:
        # Release the log file and the database resources even when the loop
        # is left through an exception (e.g. Ctrl-C or a missing book_root).
        errorlog.close()
        cur.close()
        conn.close()

- 阅读剩余部分 -