Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """pyspider handler that crawls a novel site chapter by chapter.

    Flow: ``on_start`` queues the category list pages, ``list_Caterg``
    follows every book link, ``list_Caterg_detail`` collects the book
    metadata and cover, ``index_page`` opens the first chapter and
    ``detail_page`` stores/publishes a chapter and then follows the
    "next chapter" link.
    """

    # The original script binds these two names at module level from inside
    # the class body; that is kept so existing readers of the module globals
    # keep working.
    global Datos, P_dir
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # MySQL connection settings shared by every add_* helper (placeholders).
    _DB_SETTINGS = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # URL slug of a category -> category name stored with each book.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Book metadata keys carried from callback to callback through ``save``.
    _SAVE_KEYS = (
        'Cater_Name',
        'Book_author',
        'Book_Introduction',
        'Book_Synopsis',
        'Book_Palabras',
        'img',
    )

    # (needle, replacement) pairs applied in order with str.replace, i.e. the
    # needles are matched literally, not as regular expressions.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Runs of these punctuation characters are removed from scraped text.
    # NOTE(review): under Python 2 this is a byte-string pattern applied to
    # unicode text, so the non-ASCII half of the pattern may not match the
    # characters it appears to target -- confirm before changing it.
    _PUNCTUATION_RE = re.compile(r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+')

    def _insert(self, sql, params):
        """Execute one parameterized INSERT on a fresh connection.

        Connection errors propagate to the caller. Errors raised while
        executing are printed and rolled back (best effort, as before).

        Returns the cursor's ``lastrowid`` on success, ``None`` on failure.
        """
        # closing() guarantees the connection is released on every path.
        with closing(pymysql.connect(**self._DB_SETTINGS)) as db:
            try:
                cursor = db.cursor()
                # The driver escapes ``params``; scraped text never becomes
                # part of the SQL statement itself.
                cursor.execute(sql, params)
                db.commit()
                return cursor.lastrowid
            except Exception as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book name, id, cover URL/path, date)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s, %s, %s, %s, %s)')
        self._insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (chapter title index of a book)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s, %s, %s, %s, %s)')
        self._insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter with its book metadata into ``BookConte``.

        Prints the new row id when the insert succeeds.
        """
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                  BookConte, Titleid, abover, Book_Date)
        row_id = self._insert(sql, params)
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the remote publishing endpoint as GBK form data.

        Returns the ``requests`` response object.
        """
        # NOTE(review): changing the interpreter-wide default encoding is a
        # Python 2 hack kept from the original; every text field below is
        # already encoded explicitly, so it may be removable -- confirm.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        # Bounded wait so an unresponsive endpoint cannot block the worker.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first list page to crawl
        self.total_num = 200  # last list page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue list pages ``page_num``..``total_num`` of every category.

        Loop variables are local, so the handler's configuration is left
        untouched and the scheduled job does the same work on every run.
        """
        for page in range(self.page_num, self.total_num + 1):
            for category_id in self.CaterIds:
                # A slug without a mapping falls back to the slug itself.
                Cater_Name = self._CATEGORY_NAMES.get(category_id, category_id)
                print(category_id)
                url = self.base_url1 + str(category_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Follow every absolute book link found on a category list page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Collect book metadata, save the cover and queue the chapter index."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        try:
            Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
        except IndexError:
            # The title line did not have the expected "<x> <count>|..." layout.
            Book_Palabras = ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults keep the callback working for books without a cover image.
        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover to <P_dir>/<name>/<name>.<extension>.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional database publishing hook.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter listed on a book's chapter index."""
        Datos = dict((key, response.save[key]) for key in self._SAVE_KEYS)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    def _scrub(self, text, replacements):
        """Apply the literal replacements in order, then strip punctuation runs."""
        for needle, substitute in replacements:
            text = text.replace(needle, substitute)
        return self._PUNCTUATION_RE.sub(u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Store and publish one chapter, then queue the next chapter.

        Returns a dict with the chapter and its book metadata.
        """
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = response.save['Cater_Name']  # category
        Book_author = response.save['Book_author']  # author
        Book_Synopsis = response.save['Book_Synopsis']  # latest update
        Book_Palabras = response.save['Book_Palabras']  # word count
        # Plain string; a stray trailing comma used to turn this into a tuple.
        Book_img = response.save['img']  # cover image URL
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        abover = response.doc('.article-title').text() + response.save['Book_Synopsis'] + response.save['Book_Palabras'] + response.save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        raw_content = response.doc('.article-con').text()  # chapter body
        BookConte = self._scrub(raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(response.save['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]

        # Optional database publishing hooks.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional HTTP publishing hook.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Hand the unmodified book metadata on to the next chapter.
        Datos = dict((key, response.save[key]) for key in self._SAVE_KEYS)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        Returns True once the file has been written; request and file
        errors propagate to the caller.
        """
        # Fetch first so a failed request does not leave an empty file behind.
        imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback that stores a fetched image using the names in ``response.save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # join() adds the separator only when imgDir lacks a trailing one.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` when missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the text after the last "." of ``url``."""
        return url.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first "."."""
        return url.split("/")[-1].split(".")[0]
复制代码 , O7 H9 r/ S! C
" E. Y! s; P% P6 `( I, w$ z
|