#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Python + pyspider crawler for a novel site: scraped data is stored in a
# database, published through a LocoySpider (火车头) endpoint, and resources
# are downloaded to local disk. (The original post also offers to write
# custom crawlers on request.)
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site and republishes each chapter.

    Callback chain: ``on_start`` queues the category listing pages,
    ``list_Caterg`` queues every book found on a listing page,
    ``list_Caterg_detail`` reads the book metadata and cover image,
    ``index_page`` queues the first chapter, and ``detail_page`` stores and
    publishes a chapter and then follows the next-chapter link.

    Each chapter is written to MySQL (``add_question``, ``add_comment``) and
    POSTed to a LocoySpider publishing endpoint (``add_locoy``); book records
    go to ``add_BookFile``.
    """

    # ``global`` inside a class body binds the names at module level, which is
    # where ``list_Caterg_detail`` looks up ``P_dir``.
    global Datos
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Connection settings shared by the three insert methods. The credentials
    # are placeholders and must be replaced before running.
    _DB_SETTINGS = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Keys of the metadata dict handed from callback to callback via ``save``.
    _SAVE_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Source-site category slug -> category label sent to the database and
    # publisher. Values are kept exactly as in the original mapping.
    # NOTE(review): 'kongbu' -> '游戏' and 'yanqing' -> '都市' look like labels
    # of the publishing site rather than translations - confirm.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    def _execute_insert(self, sql, params, echo=False):
        """Run one parameterized INSERT on a fresh connection and commit it.

        ``sql`` uses ``%s`` placeholders and ``params`` supplies the values,
        so the driver does the quoting; scraped text containing quotes can no
        longer break or alter the statement. The connection is always closed.
        Statement errors are printed and rolled back, not raised, matching the
        best-effort behaviour of the original methods. ``echo`` additionally
        prints the statement and the new row id.
        """
        db = pymysql.connect(**self._DB_SETTINGS)
        with closing(db):
            try:
                cursor = db.cursor()
                if echo:
                    print(sql)
                cursor.execute(sql, params)
                if echo:
                    print(cursor.lastrowid)
                db.commit()
            except Exception as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()

    def _carry_over(self, saved):
        """Return a fresh dict holding the book metadata keys of ``saved``."""
        return {key: saved[key] for key in self._SAVE_KEYS}

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one book record (name, id, cover URL, local cover path, date)
        into the ``BookFile`` table."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title record into the ``BookTitle`` table."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Insert one full chapter record into the ``BookConte`` table and
        print the statement and the new row id."""
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                  BookConte, Titleid, abover, Book_Date)
        self._execute_insert(sql, params, echo=True)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the LocoySpider publishing endpoint.

        Text fields are GBK-encoded (unencodable characters are dropped);
        ``Book_img`` is sent unchanged. Prints the response text and raw body
        and returns the ``requests`` response object.
        """
        # NOTE(review): process-wide Python 2 default-encoding hack, kept
        # unchanged because other prints/encodes in the process may rely on
        # it - consider removing once that is confirmed.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint URL
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every listing page (``page_num``..``total_num``) of every category.

        Local loop variables are used so the handler is not mutated: the
        original advanced ``self.page_num`` permanently, so a later run on the
        same instance queued nothing.
        """
        for page in range(self.page_num, self.total_num + 1):
            for category_id in self.CaterIds:
                category_name = self._CATEGORY_NAMES.get(category_id, '')
                print(category_id)
                url = self.base_url1 + str(category_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=category_name)

    def list_Caterg(self, response):
        """Queue the detail page of every book linked from a listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Read a book page, download its cover, record the book and queue
        its chapter index.

        A page without a cover image, or without the expected title line, no
        longer raises: the missing values are stored as empty strings.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token of the title line, up to the first '|'.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Cover download: saved as <P_dir><name>/<name>.<extension>.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        # NOTE(review): the pasted source lost its indentation, so it is
        # unclear whether the statements below sat inside the image loop.
        # They run once per book here; with a single cover image both
        # readings behave the same.
        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional database publishing hook for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter of a book; later chapters are reached
        through the next-chapter link followed in ``detail_page``."""
        Datos = self._carry_over(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Clean one chapter page, store and publish it, then queue the next one.

        Returns the chapter record as a dict (the pyspider task result).
        """
        # Literal find/replace pairs, applied in this order with str.replace.
        # NOTE(review): r'^\\n\\n' looks like a regex but is replaced as
        # literal text, so it almost never matches - confirm the intent.
        content_swaps = (
            (u'哈书', u'静思'),
            (u'huhjsd.CC', u'aghgf.com'),
            (u'小说网', u'文学网'),
            (u'fgdfgf', u'aghgf.com'),
            (u'fgfgf', u'aghgf.com'),
            (r'^\\n\\n', r'<BR>'),
        )
        intro_swaps = (
            (u'哈书', u'静思'),
            (u'huhjsd.CC', u'aghgf.com'),
            (r'^\\n\\n', u'aghgfh.com'),
        )
        # Runs of these punctuation characters are deleted from the text.
        # NOTE(review): under Python 2 this is a byte-string pattern, so the
        # non-ASCII characters in the second class probably do not match
        # unicode text as intended - confirm before changing it.
        strip_pattern = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # category
        Book_author = saved['Book_author']  # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        # Chapter title followed by the raw synopsis, word count and intro.
        abover = (Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras']
                  + saved['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())  # crawl time

        BookConte = response.doc('.article-con').text()  # chapter body
        for old, new in content_swaps:
            BookConte = BookConte.replace(old, new)
        BookConte = re.sub(strip_pattern, u'', BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        Book_Introduction = saved['Book_Introduction']  # book introduction
        for old, new in intro_swaps:
            Book_Introduction = Book_Introduction.replace(old, new)
        Book_Introduction = re.sub(strip_pattern, u'', Book_Introduction)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # The original had a trailing comma here, which made this a 1-tuple
        # instead of the cover URL string.
        Book_img = saved['img']

        # Optional database publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                          BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional LocoySpider publishing hook.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Follow the next-chapter link, carrying the untouched book metadata.
        Datos = self._carry_over(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        # NOTE(review): indentation was lost in the pasted source; the result
        # is returned after the loop so a last chapter still yields a record.
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and save it as ``imgDir/file_name``.

        Returns True once the image is written and False when the server
        answers with an error status (nothing is written then). The original
        returned None, so the caller's ``if self.download(...)`` never fired.
        Network errors still propagate. ``P_dir`` is unused and kept only for
        call compatibility.
        """
        # Fetch first so a failed request does not leave an empty file behind.
        imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        if not imag.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        target = imgDir + "/" + file_name
        with open(target, 'wb') as out:
            out.write(imag.content)
        return True

    def save_imgs(self, response):
        """pyspider callback: store the response body as an image, using the
        ``file_name`` and ``imgDir`` values passed in ``response.save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join adds the separator the original concatenation lacked
        # and leaves the path unchanged when imgDir already ends with one.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if needed."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' in ``url`` (the whole URL when
        it contains no '.')."""
        return url.rsplit(".", 1)[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first '.'."""
        last_segment = url.rsplit("/", 1)[-1]
        return last_segment.split(".", 1)[0]
复制代码 & N, ?. _* o4 l
( \7 O8 e, j+ f; P |