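# Python + pyspider crawler for a novel site: stores results in a database, publishes through Locoy (火车头), downloads resources to local disk; can be adapted into other crawlers.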
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """Crawl a novel site with pyspider, then store and publish each chapter.

    Callback chain: ``on_start`` queues the category listing pages,
    ``list_Caterg`` follows every book link on a listing page,
    ``list_Caterg_detail`` reads the book metadata and cover image,
    ``index_page`` opens the first chapter of the catalogue and
    ``detail_page`` processes one chapter and queues the next one.
    Each chapter is inserted into MySQL and posted to a Locoy endpoint.
    """

    # The script publishes these two names at module level through ``global``
    # statements inside the class body; that is kept so existing references
    # to the module-level ``P_dir`` / ``Datos`` continue to resolve.
    global P_dir
    global Datos
    P_dir = '/Tools/Debug/'  # local directory cover images are saved under
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Keyword arguments for pymysql.connect(); the values are placeholders
    # that have to be replaced with real credentials.
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Category slug used in the listing URL -> category name that is stored
    # and published.  Several slugs deliberately share one published name.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the book metadata dict handed from callback to callback.
    BOOK_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Ordered literal (old, new) substitutions applied to the chapter text.
    # NOTE(review): r'^\\n\\n' is applied with str.replace(), i.e. as the
    # literal characters caret-backslash-backslash-n..., not as a regex for
    # leading blank lines.  Kept unchanged; confirm the intent.
    CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Ordered literal (old, new) substitutions applied to the introduction.
    INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Punctuation stripped from chapter text and introduction.  Under
    # Python 2 the raw literal is a byte string, whose multi-byte characters
    # cannot match unicode text, so it is decoded before compiling.
    _punctuation = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
    if isinstance(_punctuation, bytes):
        _punctuation = _punctuation.decode('utf-8')
    PUNCTUATION_RE = re.compile(_punctuation, re.UNICODE)
    del _punctuation

    def __init__(self):
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first listing page to crawl
        self.total_num = 200   # last listing page to crawl (inclusive)

    # ------------------------------------------------------------------
    # Database publishing
    # ------------------------------------------------------------------
    def _insert_row(self, table, columns, values, verbose=False):
        """Insert one row into ``table`` using a parameterized statement.

        ``columns`` and ``values`` are parallel sequences.  The values are
        passed to the driver separately from the SQL text, so quotes in
        scraped text can neither break nor alter the statement.  Errors
        raised while executing are printed and rolled back (best effort);
        the connection is always closed.  With ``verbose`` the statement
        and the new row id are printed.
        """
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (
            table,
            ', '.join(columns),
            ', '.join(['%s'] * len(columns)),
        )
        with closing(pymysql.connect(**self.db_config)) as db:
            try:
                with closing(db.cursor()) as cursor:
                    if verbose:
                        print(sql)
                    cursor.execute(sql, tuple(values))
                    if verbose:
                        print(cursor.lastrowid)
                db.commit()
            except Exception as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Record a book's cover (remote URL and local path) in ``BookFile``."""
        self._insert_row(
            'BookFile',
            ('Bookname', 'BookID', 'img', 'Locaimg', 'Book_Date'),
            (Bookname, BookIDs, img, Locaimg, Book_Dates),
        )

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Record a chapter title in ``BookTitle``."""
        self._insert_row(
            'BookTitle',
            ('Bookname', 'Booktitle', 'BookID', 'Titleid', 'Book_Date'),
            (Bookname, Booktitle, BookID, Titleid, Book_Date),
        )

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Store one chapter together with its book metadata in ``BookConte``."""
        self._insert_row(
            'BookConte',
            ('Bookname', 'Cater_Name', 'Book_author', 'Book_Introduction',
             'Book_Synopsis', 'Book_Palabras', 'Bookurl', 'Booktitle',
             'BookID', 'BookConte', 'Titleid', 'abover', 'Book_Date'),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
             BookID, BookConte, Titleid, abover, Book_Date),
            verbose=True,
        )

    # ------------------------------------------------------------------
    # Locoy publishing
    # ------------------------------------------------------------------
    @staticmethod
    def _to_gbk(text):
        """Return ``text`` encoded as GBK, dropping unencodable characters.

        Byte strings are decoded first (assumed UTF-8, the encoding this
        file declares - TODO confirm for values produced elsewhere).  This
        replaces the former process-wide ``sys.setdefaultencoding("gbk")``.
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8', 'ignore')
        return text.encode('gbk', 'ignore')

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are sent GBK-encoded; ``Book_img`` is sent unchanged.
        Returns the ``requests`` response object.
        """
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-end user name (placeholder)
            'my_p': '密码',  # back-end password (placeholder)
            'subject_669977_net': self._to_gbk(Bookname),
            'caid': self._to_gbk(Cater_Name),
            'title_669977_net': self._to_gbk(Booktitle),
            'article': self._to_gbk(BookConte),
            'author': self._to_gbk(Book_author),
            'ready_1': self._to_gbk(Book_Palabras),
            'thumb': Book_img,
            'content': self._to_gbk(Book_Introduction),
            'abover': self._to_gbk(abover),
        }
        res = requests.post(locoy_url, data=locoy_data, timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every listing page of every category.

        Pages ``self.page_num`` .. ``self.total_num`` are visited; for each
        page all categories are queued.  Loop state is kept in locals so a
        repeated call on the same instance queues the same pages again
        instead of finding ``page_num`` already past ``total_num``.
        """
        for page in range(self.page_num, self.total_num + 1):
            for category in self.CaterIds:
                # Unknown slugs fall back to the slug itself.
                Cater_Name = self.CATEGORY_NAMES.get(category, category)
                print(category)
                url = self.base_url1 + str(category) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Follow every absolute book link found on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    @staticmethod
    def _parse_word_count(text):
        """Return the second space-separated token of ``text`` up to ``|``.

        Returns an empty string when ``text`` contains no space.
        """
        tokens = text.split(' ')
        if len(tokens) < 2:
            return ''
        return tokens[1].split('|')[0]

    def list_Caterg_detail(self, response):
        """Read book metadata and cover from a book page, then open its catalogue.

        The cover is downloaded below ``P_dir`` and recorded with
        ``add_BookFile``; the metadata dict is passed on to ``index_page``.
        When the page has no matching cover image, the image URL and local
        path are recorded as empty strings.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = self._parse_word_count(response.doc('.booktitle p').text())
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id taken from the URL
        Book_Dates = str(datetime.datetime.now())

        img = ''
        Locaimg = ''
        for node in response.doc('.bigpic > img[src^="http"]').items():
            img = node.attr.src
            print(img)
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: saves the cover image to local disk.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        book = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=book)

    @classmethod
    def _book_meta(cls, save):
        """Copy the book metadata keys out of a ``response.save`` mapping."""
        return {key: save[key] for key in cls.BOOK_KEYS}

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """From the catalogue page, open the first chapter of the book."""
        book = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book)

    @classmethod
    def _clean_text(cls, text, replacements):
        """Apply the literal ``replacements`` in order, then strip punctuation."""
        for old, new in replacements:
            text = text.replace(old, new)
        return cls.PUNCTUATION_RE.sub(u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Process one chapter page: clean, store, publish, queue the next one.

        Returns a dict with the chapter and book fields as the task result.
        """
        save = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = save['Cater_Name']  # category
        Book_author = save['Book_author']  # author
        Book_Synopsis = save['Book_Synopsis']  # latest update
        Book_Palabras = save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        abover = response.doc('.article-title').text() + save['Book_Synopsis'] + save['Book_Palabras'] + save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        raw_content = response.doc('.article-con').text()  # chapter text
        BookConte = self._clean_text(raw_content, self.CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._clean_text(save['Book_Introduction'], self.INTRO_REPLACEMENTS)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # No trailing comma here: the value must be the URL string itself,
        # not a one-element tuple.
        Book_img = save['img']  # cover image URL

        # Optional: database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing hook.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Follow the link to the next chapter with the unmodified metadata.
        book = self._book_meta(save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=book)

        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download ``Book_img`` to ``imgDir/file_name``.

        The image is fetched before the file is opened, so a failed request
        leaves no empty file behind.  Returns True when the file was
        written and False (after printing the error) on an HTTP or
        filesystem failure.  ``P_dir`` is accepted for interface
        compatibility and is not used.
        """
        try:
            reply = requests.get(Book_img, timeout=self.crawl_config['timeout'])
            reply.raise_for_status()
            if not os.path.exists(imgDir):
                os.makedirs(imgDir)
            with open(imgDir + "/" + file_name, 'wb') as out:
                out.write(reply.content)
        except (requests.RequestException, IOError, OSError) as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        return True

    def save_imgs(self, response):
        """Callback form of image saving: write ``response.content`` to disk.

        Reads ``file_name`` and ``imgDir`` from ``response.save``; the two
        are concatenated without a separator.
        """
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        file_path = imgDir + file_name
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` when missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    @staticmethod
    def _url_filename(url):
        """Return the last path segment of ``url`` without query or fragment."""
        return url.split("?")[0].split("#")[0].split("/")[-1]

    def getExtension(self, url):
        """Return the file extension of ``url`` ('' when the file has none)."""
        filename = self._url_filename(url)
        if "." not in filename:
            return ""
        return filename.rsplit(".", 1)[-1]

    def getname(self, url):
        """Return the file name of ``url`` up to its first dot."""
        return self._url_filename(url).split(".")[0]
复制代码
- ~5 l. x- z( S0 D0 j . z! g, N6 J) O
|