68 Storing Data


The first choice for data persistence is a relational database, and there are many relational database products to pick from, including Oracle, MySQL, SQL Server, and PostgreSQL. If you need to store a large volume of low-value data, a document database is also a good option; MongoDB is one of the leading document databases, and interested readers can explore it on their own.
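For comparison, a minimal sketch of writing one crawled page to MongoDB with pymongo might look like the code below; the connection string, the spider database, and the zhihu_pages collection are illustrative assumptions, not something defined in this article.

import pymongo

# Hypothetical connection details for illustration only.
client = pymongo.MongoClient('mongodb://localhost:27017')
collection = client.spider.zhihu_pages

# Store one crawled page as a document; upsert on the URL digest
# so the same page is not saved twice.
collection.update_one(
    {'digest': '0f1d...'},                      # SHA-1 digest of the URL (placeholder)
    {'$set': {'url': 'https://www.zhihu.com/explore',
              'page': '<html>...</html>'}},
    upsert=True)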

The following code demonstrates how to use MySQL to store the links and pages crawled from the Zhihu Explore page.

create database zhihu default charset utf8;
create user 'zhihu'@'%' identified by 'zhihu.618';
grant all privileges on zhihu.* to 'zhihu'@'%';
flush privileges;

use zhihu;
create table tb_explore
(
    id integer auto_increment,
    url varchar(1024) not null,
    page longblob not null,
    digest char(48) unique not null,
    idate datetime default now(),
    primary key (id)
);

import hashlib
import pickle
import re
import zlib
from urllib.parse import urljoin

import pymysql
import requests
from bs4 import BeautifulSoup

conn = pymysql.connect(host='localhost',
                       # port=3306,
                       user='zhihu', password='zhihu.618',
                       database='zhihu', charset='utf8',
                       autocommit=True)


def write_to_db(url, page, digest):
    """Insert a crawled URL, its compressed page, and its digest into MySQL."""
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                'insert into tb_explore (url, page, digest) values (%s, %s, %s)',
                (url, page, digest))
            conn.commit()
    except pymysql.MySQLError as err:
        print(err)


def main():
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    headers = {'user-agent': 'Baiduspider'}
    try:
        resp = requests.get(seed_url, headers=headers)
        soup = BeautifulSoup(resp.text, 'lxml')
        # Only follow links that point to question pages.
        href_regex = re.compile(r'^/question')
        for a_tag in soup.find_all('a', {'href': href_regex}):
            href = a_tag.attrs['href']
            full_url = urljoin(base_url, href)
            # The SHA-1 digest of the URL deduplicates pages (unique column in the table).
            digest = hashlib.sha1(full_url.encode()).hexdigest()
            html_page = requests.get(full_url, headers=headers).text
            # Pickle and compress the page before writing it to the longblob column.
            zipped_page = zlib.compress(pickle.dumps(html_page))
            write_to_db(full_url, zipped_page, digest)
    finally:
        conn.close()


if __name__ == '__main__':
    main()
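Because each page is pickled and zlib-compressed before being written to the longblob column, reading it back has to undo both steps. Below is a minimal sketch of that, reusing the connection parameters above; the digest value in the query is a placeholder.

import pickle
import zlib

import pymysql

conn = pymysql.connect(host='localhost', user='zhihu', password='zhihu.618',
                       database='zhihu', charset='utf8')
try:
    with conn.cursor() as cursor:
        # Fetch one stored page by its URL digest (placeholder value).
        cursor.execute('select page from tb_explore where digest = %s',
                       ('0f1d...',))
        row = cursor.fetchone()
        if row:
            # Reverse the zlib compression and the pickling done by the crawler.
            html_page = pickle.loads(zlib.decompress(row[0]))
            print(html_page[:100])
finally:
    conn.close()

The next example replaces the MySQL table with a Redis hash and caches the same question pages there.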

import hashlib
import pickle
import re
import zlib
from urllib.parse import urljoin

import redis
import requests
from bs4 import BeautifulSoup


def main():
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    headers = {'user-agent': 'Baiduspider'}
    resp = requests.get(seed_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    # Only follow links that point to question pages.
    href_regex = re.compile(r'^/question')
    for a_tag in soup.find_all('a', {'href': href_regex}):
        href = a_tag.attrs['href']
        full_url = urljoin(base_url, href)
        # The SHA-1 digest of the URL serves as the field name in the Redis hash.
        field_key = hashlib.sha1(full_url.encode()).hexdigest()
        # Skip pages that are already cached.
        if not client.hexists('spider:zhihu:explore', field_key):
            html_page = requests.get(full_url, headers=headers).text
            # Pickle and compress the page before caching it.
            zipped_page = zlib.compress(pickle.dumps(html_page))
            client.hset('spider:zhihu:explore', field_key, zipped_page)
    print('Total %d question pages found.' % client.hlen('spider:zhihu:explore'))


if __name__ == '__main__':
    main()
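Reading the cached pages back out of the Redis hash is symmetrical: fetch each field and undo the compression and pickling. A minimal sketch, assuming the same Redis connection parameters used above:

import pickle
import zlib

import redis

client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')

# Walk the URL digests cached so far and decode each page.
for field_key in client.hkeys('spider:zhihu:explore'):
    zipped_page = client.hget('spider:zhihu:explore', field_key)
    # Reverse zlib.compress(pickle.dumps(...)) used when the page was cached.
    html_page = pickle.loads(zlib.decompress(zipped_page))
    print(field_key.decode(), len(html_page))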