Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import contextlib
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site category by category.

    For every book it stores the cover record, then walks the chapters one
    by one (following the "next chapter" link), writing each chapter to
    MySQL and posting it to a Locoy publishing endpoint.
    """

    # Declared global so the two assignments below bind module-level names;
    # the methods read P_dir as a plain global.
    global Datos
    global P_dir
    P_dir = '/Tools/Debug/'  # local directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Connection arguments shared by every insert (placeholders to fill in).
    DB_CONFIG = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Source-site category slug -> category name sent to the target system.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback.
    META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Ordered literal (str.replace) substitutions applied to chapter text.
    # NOTE(review): the last pattern looks like a regex but is applied as a
    # literal string, exactly as before - confirm the intent.
    CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )

    # Ordered literal substitutions applied to the book introduction.
    INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Characters removed from chapter text and introduction.
    # NOTE(review): on Python 2 this is a byte-string pattern, so the
    # non-ASCII members are presumably matched byte-wise - verify.
    PUNCTUATION_RE = re.compile(r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+')

    def _insert(self, table, columns, values):
        """Insert one row and return ``cursor.lastrowid`` (None on failure).

        ``table`` and ``columns`` are constants supplied by this class; the
        scraped ``values`` are passed as query parameters so the driver
        escapes them instead of being spliced into the SQL text.
        """
        placeholders = ", ".join(["%s"] * len(columns))
        sql = "insert into %s(%s) values (%s)" % (
            table, ", ".join(columns), placeholders)
        # closing() guarantees the connection is released on every path.
        with contextlib.closing(pymysql.connect(**self.DB_CONFIG)) as db:
            try:
                with contextlib.closing(db.cursor()) as cursor:
                    cursor.execute(sql, tuple(values))
                    row_id = cursor.lastrowid
                db.commit()
            except Exception as err:
                # Best-effort insert: report and roll back, do not abort
                # the crawl callback.
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None
        return row_id

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Store a book's cover record (remote and local image path)."""
        self._insert(
            "BookFile",
            ("Bookname", "BookID", "img", "Locaimg", "Book_Date"),
            (Bookname, BookIDs, img, Locaimg, Book_Dates),
        )

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Store a chapter-title record for a book."""
        self._insert(
            "BookTitle",
            ("Bookname", "Booktitle", "BookID", "Titleid", "Book_Date"),
            (Bookname, Booktitle, BookID, Titleid, Book_Date),
        )

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Store one chapter with its book metadata; prints the new row id."""
        row_id = self._insert(
            "BookConte",
            ("Bookname", "Cater_Name", "Book_author", "Book_Introduction",
             "Book_Synopsis", "Book_Palabras", "Bookurl", "Booktitle",
             "BookID", "BookConte", "Titleid", "abover", "Book_Date"),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
             BookID, BookConte, Titleid, abover, Book_Date),
        )
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint.

        Text fields are sent GBK-encoded (unencodable characters dropped).
        Returns the ``requests`` response object.
        """
        if sys.version_info[0] == 2:
            # Legacy Python 2 hack kept for compatibility: changes the
            # process-wide default encoding. Not available on Python 3.
            reload(sys)  # noqa: F821
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # backend user name
            'my_p': '密码',  # backend password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        # Without a timeout a stalled endpoint would block this callback
        # forever; reuse the crawler's own timeout.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1  # first listing page to crawl
        self.total_num = 200  # last listing page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every listing page of every category.

        Uses local loop variables so repeated scheduled runs on the same
        handler instance always start again from ``self.page_num``.
        """
        for page_num in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self.CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page_num) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Listing page: queue the detail page of every book shown."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Book page: collect metadata, fetch the cover, queue the catalog."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated field, up to the first '|'; empty when the
        # text does not have that shape.
        title_fields = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_fields[1].split('|')[0] if len(title_fields) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults keep the book flowing when the page has no cover image.
        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Cover download (optional: the next lines may be commented out)
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing interface for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _book_meta(self, save):
        """Copy the per-book metadata fields out of a task's ``save`` dict."""
        return dict((key, save[key]) for key in self.META_KEYS)

    def _scrub(self, text, replacements):
        """Apply literal replacements in order, then strip punctuation."""
        for old, new in replacements:
            text = text.replace(old, new)
        return self.PUNCTUATION_RE.sub(u'', text)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Catalog page: queue only the first chapter.

        Later chapters are reached through the "next" link in detail_page.
        """
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Chapter page: clean the text, publish it, queue the next chapter.

        Returns a dict with the chapter and book fields as the task result.
        """
        # NOTE(review): with @catch_status_code_error this presumably also
        # runs for non-200 responses - confirm such pages should be stored.
        meta = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = meta['Cater_Name']  # category
        Book_author = meta['Book_author']  # author
        Book_Synopsis = meta['Book_Synopsis']  # latest update
        Book_Palabras = meta['Book_Palabras']  # word count
        Book_img = meta['img']  # cover image URL (plain value, not a tuple)
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        abover = Booktitle + Book_Synopsis + Book_Palabras + meta['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        BookConte = self._scrub(response.doc('.article-con').text(),
                                self.CONTENT_REPLACEMENTS)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(meta['Book_Introduction'],
                                        self.INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]

        # Optional: database publishing interface for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing interface; remove if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._book_meta(meta)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Download ``Book_img`` to ``imgDir/file_name``.

        Returns True when the file was written, False when the request
        failed (nothing is written in that case). ``P_dir`` is unused and
        kept only for call compatibility.
        """
        # Fetch first so a failed request never leaves an empty file behind.
        try:
            imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
            imag.raise_for_status()
        except requests.RequestException as err:
            print("Error %s for download image: %s" % (err, Book_img))
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback: write a fetched image using paths from ``response.save``."""
        imgDir = response.save["imgDir"]
        # join() adds the separator only when imgDir lacks a trailing one.
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` if missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def _url_filename(self, url):
        """Return the last path segment of ``url`` without query/fragment."""
        path = url.split("?", 1)[0].split("#", 1)[0]
        return path.split("/")[-1]

    def getExtension(self, url):
        """Return the file extension of ``url`` ('' when it has none)."""
        filename = self._url_filename(url)
        if "." not in filename:
            return ""
        return filename.split(".")[-1]

    def getname(self, url):
        """Return the file name of ``url`` up to its first dot."""
        return self._url_filename(url).split(".")[0]
复制代码
9 M2 x2 P/ E2 P& l3 G. c " k. b8 X4 G7 _9 p( E& r8 Y0 L5 C# s
|