Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """pyspider handler that walks a novel site and republishes each chapter.

    Callback chain, as wired through ``self.crawl`` below:
    ``on_start`` (category list pages) -> ``list_Caterg`` (book links) ->
    ``list_Caterg_detail`` (book metadata + cover) -> ``index_page`` (first
    chapter link) -> ``detail_page`` (chapter text, then follows the
    next-chapter link back into ``detail_page``).

    Each chapter is written to MySQL (``add_question`` / ``add_comment``) and
    posted to a remote publishing endpoint (``add_locoy``).
    """

    # ``global`` inside the class body makes these two names module-level
    # globals, exactly as the original script did.
    global Datos, P_dir
    P_dir = '/Tools/Debug/'  # local root directory where cover images are saved
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Connection settings shared by every insert (placeholders to be filled in).
    DB_CONFIG = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Seconds to wait on the direct ``requests`` calls (cover download, publish).
    REQUEST_TIMEOUT = 60

    # Site category slug -> category name sent to the publishing side.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book dict handed from callback to callback via ``save``.
    _SAVE_KEYS = (
        'Cater_Name',
        'Book_author',
        'Book_Introduction',
        'Book_Synopsis',
        'Book_Palabras',
        'img',
    )

    # Literal (str.replace) substitutions applied, in order, to chapter text.
    # NOTE: the '^\\n\\n' entry is matched as literal text, not as a regex.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal substitutions applied, in order, to the book introduction.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Runs of these characters are deleted after the literal substitutions.
    _STRIP_RE = re.compile(
        r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
    )

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _insert(self, sql, params):
        """Run one parameterized INSERT and return ``cursor.lastrowid``.

        The driver substitutes ``params`` into the ``%s`` placeholders, so
        quotes in scraped text can neither break nor alter the statement.
        On any execution error the error is printed, the transaction is
        rolled back and ``None`` is returned; the connection is always closed.
        """
        db = pymysql.connect(**self.DB_CONFIG)
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            row_id = cursor.lastrowid
            db.commit()
            return row_id
        except Exception as err:
            # Best effort, as before: report and keep the crawl going.
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
            return None
        finally:
            db.close()

    @staticmethod
    def _to_gbk(value):
        """Return ``value`` encoded as GBK bytes, dropping unencodable chars.

        Replaces the former ``sys.setdefaultencoding("gbk")`` hack with an
        explicit conversion.
        """
        if isinstance(value, bytes):
            # assumes byte strings are UTF-8, per the file's coding line -- TODO confirm
            value = value.decode('utf-8', 'ignore')
        return value.encode('gbk', 'ignore')

    @classmethod
    def _carry_over(cls, save):
        """Copy the per-book fields out of a ``response.save`` dict."""
        return {key: save[key] for key in cls._SAVE_KEYS}

    @classmethod
    def _scrub(cls, text, replacements):
        """Apply literal ``replacements`` in order, then strip ``_STRIP_RE`` matches."""
        for needle, substitute in replacements:
            text = text.replace(needle, substitute)
        return cls._STRIP_RE.sub(u'', text)

    # ------------------------------------------------------------------
    # Database publishing
    # ------------------------------------------------------------------
    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book name/id, cover URL, local cover path, date)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (book name, chapter title, book id, chapter id, date)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter row into ``BookConte`` and print the new row id."""
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) '
               'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        row_id = self._insert(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
            Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid,
            abover, Book_Date,
        ))
        print(row_id)

    # ------------------------------------------------------------------
    # Remote publishing
    # ------------------------------------------------------------------
    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the publishing endpoint and return the response.

        Text fields are sent GBK-encoded; ``Book_img`` is sent unchanged.
        Both the decoded and the raw response body are printed.
        """
        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint URL
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': self._to_gbk(Bookname),
            'caid': self._to_gbk(Cater_Name),
            'title_669977_net': self._to_gbk(Booktitle),
            'article': self._to_gbk(BookConte),
            'author': self._to_gbk(Book_author),
            'ready_1': self._to_gbk(Book_Palabras),
            'thumb': Book_img,
            'content': self._to_gbk(Book_Introduction),
            'abover': self._to_gbk(abover),
        }
        res = requests.post(locoy_url, data=locoy_data, timeout=self.REQUEST_TIMEOUT)
        print(res.text)
        print(res.content)
        return res

    # ------------------------------------------------------------------
    # Crawl callbacks
    # ------------------------------------------------------------------
    def __init__(self):
        """Set the URL pieces, the category slugs and the list-page range."""
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1
        self.total_num = 200

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue list pages ``page_num``..``total_num`` for every category.

        Loop variables are local, so no instance attribute is advanced and
        each run queues the same page range.
        """
        for page_num in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self._CATEGORY_NAMES.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page_num) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Queue every book link on a category list page, passing the category name along."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page: metadata, cover download, BookFile row, catalogue link.

        A page without a cover image yields empty ``img`` / local path values
        instead of an unbound-name error.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # Second space-separated token, up to the first '|'; empty when absent.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id taken from the URL
        Book_Dates = str(datetime.datetime.now())

        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Cover download: saved as <P_dir><name>/<name>.<extension>
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter link of the catalogue, forwarding the book fields."""
        Datos = self._carry_over(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it, queue the next chapter, return the record."""
        saved = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = saved['Cater_Name']  # category
        Book_author = saved['Book_author']  # author
        Book_Synopsis = saved['Book_Synopsis']  # latest update
        Book_Palabras = saved['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        abover = Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras'] + saved['Book_Introduction']
        Book_Date = str(datetime.datetime.now())  # scrape time

        raw_content = response.doc('.article-con').text()  # chapter text
        BookConte = self._scrub(raw_content, self._CONTENT_REPLACEMENTS).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(saved['Book_Introduction'], self._INTRO_REPLACEMENTS)

        # Chapter id: the URL segment that follows "<BookID>/".
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain value; the original's trailing comma turned this into a 1-tuple.
        Book_img = saved['img']  # cover image URL

        # Store in MySQL.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Publish via HTTP POST.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._carry_over(saved)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        The image is fetched before the file is opened, so a failed request
        leaves no empty file behind. ``P_dir`` is accepted for interface
        compatibility and is not used.

        Returns:
            True when the file was written, False when the request or the
            write failed (the error is printed).
        """
        try:
            imag = requests.get(Book_img, timeout=self.REQUEST_TIMEOUT)
            imag.raise_for_status()
            if not os.path.exists(imgDir):
                os.makedirs(imgDir)
            with open(imgDir + "/" + file_name, 'wb') as f:
                f.write(imag.content)
        except (requests.RequestException, IOError, OSError) as err:
            print("Error %s for download: %s" % (err, Book_img))
            return False
        return True

    def save_imgs(self, response):
        """Write ``response.content`` to the directory and file name given in ``response.save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join inserts the separator when imgDir has no trailing slash.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Create ``imgDir`` if it does not exist and write ``content`` (bytes) to ``path``."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    def getExtension(self, url):
        """Return the text after the last '.' of the URL's final path segment.

        Any query string or fragment is ignored; returns '' when the final
        segment contains no '.'.
        """
        last_segment = url.split("?")[0].split("#")[0].split("/")[-1]
        if "." not in last_segment:
            return ""
        return last_segment.split(".")[-1]

    def getname(self, url):
        """Return the final path segment of ``url`` up to its first '.'.

        Any query string or fragment is ignored.
        """
        last_segment = url.split("?")[0].split("#")[0].split("/")[-1]
        return last_segment.split(".")[0]
复制代码
4 T; w+ M, N( b' v. d( P * S4 J/ I( ?. y4 G( j7 r4 D
|