Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2
from contextlib import closing

import pymysql
import requests
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls a novel site one category at a time.

    Callback chain: ``on_start`` schedules the category list pages,
    ``list_Caterg`` follows every book link on a list page,
    ``list_Caterg_detail`` scrapes the book metadata and cover,
    ``index_page`` opens the first chapter of the catalogue and
    ``detail_page`` scrapes one chapter and then follows the link matched by
    its ``.articlebtn`` selector (presumably "next chapter" -- confirm
    against the target site).

    Scraped rows are written to MySQL (``BookFile``, ``BookTitle``,
    ``BookConte``) and each chapter is also posted to a Locoy endpoint.
    """

    # Local directory under which downloaded cover images are stored.
    P_dir = '/Tools/Debug/'

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Connection settings shared by every insert; the values are
    # placeholders that must be replaced with real credentials.
    db_config = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # URL slug of a category -> category name stored with each book.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the metadata dict handed from callback to callback via ``save``.
    _BOOK_META_KEYS = (
        'Cater_Name',
        'Book_author',
        'Book_Introduction',
        'Book_Synopsis',
        'Book_Palabras',
        'img',
    )

    # Literal (search, replacement) pairs applied in order to chapter text.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )

    # Literal (search, replacement) pairs applied in order to the synopsis.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Punctuation stripped from scraped text.  This must be a *unicode*
    # pattern: a byte-string pattern would be matched byte-by-byte against
    # unicode text and never hit the CJK punctuation it lists.
    _PUNCTUATION_RE = re.compile(
        u'[\f\t\v+.{}()!/_,$%^*"\']+'
        u'|[+——()?【】“”!,。?、~@#¥%……&*()]+'
    )

    def _insert_row(self, table, columns, values):
        """Insert one row into ``table`` and return its ``lastrowid``.

        The statement is parameterized so quotes inside scraped text cannot
        break or alter the SQL.  Failures while executing are printed and
        rolled back (best effort, never raised); in that case ``None`` is
        returned.  The connection is always closed.
        """
        placeholders = ', '.join(['%s'] * len(columns))
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (
            table, ', '.join(columns), placeholders)
        with closing(pymysql.connect(**self.db_config)) as db:
            try:
                with closing(db.cursor()) as cursor:
                    cursor.execute(sql, tuple(values))
                    row_id = cursor.lastrowid
                db.commit()
            except Exception as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
                return None
        return row_id

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Store a book's cover record (remote URL and local path) in ``BookFile``."""
        return self._insert_row(
            'BookFile',
            ('Bookname', 'BookID', 'img', 'Locaimg', 'Book_Date'),
            (Bookname, BookIDs, img, Locaimg, Book_Dates),
        )

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Store a chapter index entry in ``BookTitle``."""
        return self._insert_row(
            'BookTitle',
            ('Bookname', 'Booktitle', 'BookID', 'Titleid', 'Book_Date'),
            (Bookname, Booktitle, BookID, Titleid, Book_Date),
        )

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Store a full chapter row in ``BookConte`` and print the new row id."""
        row_id = self._insert_row(
            'BookConte',
            ('Bookname', 'Cater_Name', 'Book_author', 'Book_Introduction',
             'Book_Synopsis', 'Book_Palabras', 'Bookurl', 'Booktitle',
             'BookID', 'BookConte', 'Titleid', 'abover', 'Book_Date'),
            (Bookname, Cater_Name, Book_author, Book_Introduction,
             Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
             BookID, BookConte, Titleid, abover, Book_Date),
        )
        if row_id is not None:
            print(row_id)
        return row_id

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint, GBK-encoded.

        Returns the ``requests`` response object.  The URL and the account
        fields are placeholders that must be filled in before use.
        """
        # Python 2 only: force the implicit str/unicode codec to GBK before
        # the form is built.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1    # first list page to schedule
        self.total_num = 200  # last list page to schedule

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every list page of every category.

        Pages ``page_num``..``total_num`` are walked with local loop
        variables; the instance attributes are left untouched so that each
        periodic run schedules the full range again.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg,
                           save=self.CATEGORY_NAMES.get(cater_id, ''))

    def list_Caterg(self, response):
        """Follow every absolute book link found on a category list page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, download its cover and schedule its catalogue.

        ``response.save`` carries the category name.  The scraped metadata
        is forwarded to ``index_page`` through ``save``.
        """
        Cater_Name = response.save
        doc = response.doc
        Bookname = doc('h1').text()
        print(Bookname)
        Book_author = doc('.authorname > a').text()
        Book_Introduction = doc('.book-intro > div').text()
        Book_Synopsis = doc('b').eq(1).text()
        # The word count is the second space-separated field, up to the
        # first '|'; fall back to an empty string when the line has no space.
        title_parts = doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults keep the record insertable when the page has no cover.
        img = ''
        Locaimg = ''
        for cover in doc('.bigpic > img[src^="http"]').items():
            img = cover.attr.src
            print(img)
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = self.P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: store the cover locally.
            if self.download(self.P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing interface for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Open the first chapter listed in a book's catalogue."""
        Datos = dict((key, response.save[key]) for key in self._BOOK_META_KEYS)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    def _scrub(self, text, replacements):
        """Apply literal ``replacements`` in order, then strip punctuation."""
        for needle, substitute in replacements:
            text = text.replace(needle, substitute)
        return self._PUNCTUATION_RE.sub(u'', text)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it and schedule the follow-up page.

        Returns the scraped chapter as a dict (pyspider stores it as the
        task result).
        """
        saved = response.save
        doc = response.doc
        Bookname = doc('.readlocation a').eq(2).text()  # book title
        print(Bookname)
        Cater_Name = saved['Cater_Name']        # category
        Book_author = saved['Book_author']      # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Book_img = saved['img']                 # cover URL (plain string)
        Bookurl = response.url                  # chapter URL
        Booktitle = doc('.article-title').text()  # chapter title
        BookID = doc('.readset-r span').text()    # book id
        # Concatenation of chapter title, latest update, word count and the
        # unscrubbed synopsis.
        abover = Booktitle + Book_Synopsis + Book_Palabras + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # crawl time

        BookConte = self._scrub(
            doc('.article-con').text(), self._CONTENT_REPLACEMENTS
        ).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(
            saved['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]

        # Optional: database publishing interface for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle,
                          BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: publish through the Locoy POST interface.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = dict((key, saved[key]) for key in self._BOOK_META_KEYS)
        for each in doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        Returns ``True`` when the file was written and ``False`` when the
        server answered with an error status (nothing is written then).
        ``P_dir`` is accepted for interface compatibility and is not used.
        """
        reply = requests.get(Book_img)
        if not reply.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as out:
            out.write(reply.content)
        return True

    def save_imgs(self, response):
        """Callback form of the image saver: write ``response.content`` to disk.

        Expects ``file_name`` and ``imgDir`` in ``response.save``.
        """
        imgDir = response.save["imgDir"]
        # os.path.join inserts the separator the plain concatenation lacked.
        file_path = os.path.join(imgDir, response.save["file_name"])
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` first if missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' of ``url``, ignoring query/fragment."""
        path = url.split('?', 1)[0].split('#', 1)[0]
        return path.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first '.'."""
        path = url.split('?', 1)[0].split('#', 1)[0]
        return path.split("/")[-1].split(".")[0]
复制代码 7 @& f, y& r2 ^% S3 `; o: G
5 u- d D& Q. V& U
|