小狐狸事務所: Python 學習筆記 : 市圖借書與預約爬蟲程式改版 v12 (於 Pi 400)

2025年12月9日星期二

Python 學習筆記 : 市圖借書與預約爬蟲程式改版 v12 (於 Pi 400)

完成母校圖書館爬蟲程式改版移植到 Pi 3 A+ (Trixie OS) 後, 下午繼續移植市圖爬蟲程式, 主要是必須改用新的 chromium 與 chromium-driver 套件 :

from selenium.webdriver.chrome.options import Options

from selenium.webdriver.chrome.service import Service

寫法也要用新的方式 :

# Options object that collects the settings for the browser launched below.
options=Options()

# Command-line switches handed to the Chromium process.
options.add_argument("--headless=new")

options.add_argument("--no-sandbox")

options.add_argument("--disable-dev-shm-usage")

# Browser executable to launch (the chromium package mentioned above).
options.binary_location="/usr/bin/chromium"

# Driver executable (the chromium-driver package mentioned above).
service=Service("/usr/bin/chromedriver")

# NOTE: `webdriver` comes from `from selenium import webdriver`, which this
# snippet does not repeat.
browser=webdriver.Chrome(service=service, options=options)

參考 :

# Python 學習筆記 : 母校圖書館借書與預約爬蟲程式改版 v10

套件安裝在前一篇均已完成, 直接進入虛擬環境, 編輯程式與執行 :

pi@pi3aplus:~ $ source ~/myenv313/bin/activate

(myenv313) pi@pi3aplus:~ $ nano ksml_lib_12.py

輸入新版程式 :

# ksml_lib_12.py

from selenium import webdriver

from selenium.webdriver.common.by import By

from selenium.webdriver.chrome.options import Options

from selenium.webdriver.chrome.service import Service

import re

from datetime import datetime

import time

import requests

import sys

async def telegram_send_text(text):
    """Send ``text`` to the Telegram chat configured at module level.

    Reads the module-level names ``token`` and ``chat_id`` (assigned in the
    ``__main__`` section of this script).

    Args:
        text: Message body to send.

    Returns:
        True when the message was sent, False when anything went wrong
        (the error is printed).
    """
    try:
        # ``Bot`` is not imported at the top of this script, so it is brought
        # into scope here. Doing it inside the ``try`` turns a missing package
        # or a bad token into a reported failure instead of an uncaught error.
        from telegram import Bot

        bot = Bot(token=token)
        await bot.send_message(chat_id=chat_id, text=text)
        return True
    except Exception as e:
        print(f'Error sending text: {e}')
        return False

def get_books(account, password):
    """Log in to the library's personal page and scrape loans and reservations.

    Args:
        account: Account ID typed into the login form.
        password: Password typed into the login form.

    Returns:
        A tuple ``(borrow_books, reserve_books)`` of two lists of dicts.
        Borrowed items carry the keys ``book_name``, ``book_site``,
        ``due_date``, ``due_times`` and ``state``; reserved items carry
        ``book_name``, ``ready_for_pickup``, ``expiration`` and ``sequence``.
        ``(None, None)`` is returned when any step fails (the error is
        printed).
    """
    date_pattern = r'\d{4}-\d{2}-\d{2}'
    browser = None  # lets the ``finally`` clause know whether a session exists
    try:
        # Start a headless Chromium session and log in.
        options = Options()
        options.add_argument("--headless=new")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.binary_location = "/usr/bin/chromium"
        service = Service("/usr/bin/chromedriver")
        browser = webdriver.Chrome(service=service, options=options)
        browser.implicitly_wait(60)
        browser.get('https://webpacx.ksml.edu.tw/personal/')
        loginid = browser.find_element(By.ID, 'logxinid')
        loginid.send_keys(account)
        pincode = browser.find_element(By.ID, 'pincode')
        pincode.send_keys(password)
        div_btn_grp = browser.find_element(By.CLASS_NAME, 'btn_grp')
        login_btn = div_btn_grp.find_element(By.TAG_NAME, 'input')
        login_btn.click()

        # Scrape the borrowed-books page.
        browser.find_element(By.CLASS_NAME, 'redblock').click()
        borrow_books = []
        for book in browser.find_elements(By.CLASS_NAME, 'bookdata'):
            item = {}
            book_name = book.find_element(By.XPATH, './h2/a').text
            item['book_name'] = book_name.replace('/', '').strip()
            book_site = book.find_element(By.XPATH, './ul[3]/li[1]').text
            item['book_site'] = re.findall(r'典藏地：(\S+)', book_site)[0]
            due_date = book.find_element(By.XPATH, './ul[4]/li[2]').text
            item['due_date'] = re.findall(date_pattern, due_date)[0]
            due_times = book.find_element(By.XPATH, './ul[5]/li[1]').text
            item['due_times'] = re.findall(r'\d{1}', due_times)[0]
            # The status line is optional; treat a failed lookup as "no status".
            # NOTE(review): with the 60 s implicit wait set above, a missing
            # status element presumably delays each book by that long - confirm.
            try:
                state = book.find_element(By.XPATH, './ul[6]/li[1]').text
            except Exception:
                state = ''
            item['state'] = ', 有人預約' if '有人預約' in state else ''
            borrow_books.append(item)
        print('擷取借閱紀錄 ... OK')
        browser.back()  # return to the previous page

        # Scrape the reservations page.
        browser.find_element(By.CLASS_NAME, 'blueblock').click()
        reserve_books = []
        for book in browser.find_elements(By.CLASS_NAME, 'bookdata'):
            item = {}
            book_name = book.find_element(By.XPATH, './h2/a').text
            item['book_name'] = book_name.replace('/', '').strip()
            sequence = book.find_element(By.XPATH, './ul[7]/li[1]').text
            if '預約待取' in sequence:  # already waiting at the branch
                item['ready_for_pickup'] = True
                item['expiration'] = re.findall(date_pattern, sequence)[0]
                item['sequence'] = '0'
            else:  # still queued
                item['ready_for_pickup'] = False
                item['expiration'] = ''
                item['sequence'] = re.findall(r'\d+', sequence)[0]
            reserve_books.append(item)
        print('擷取預約紀錄 ... OK')
        return (borrow_books, reserve_books)
    except Exception as e:
        print(e)
        return None, None
    finally:
        # Always end the WebDriver session, also on failure, so that no
        # browser/driver process is left behind by an aborted run.
        if browser is not None:
            try:
                browser.quit()
            except Exception as e:
                print(f'關閉瀏覽器失敗: {e}')

# Bullets used to number reserved books by their position in the list.
WAITING_MARKS = '①②③④⑤⑥⑦⑧⑨⑩'
READY_MARKS = '❶❷❸❹❺❻❼❽❾❿'


def format_borrow_lines(borrow_books, today=None):
    """Build one summary line per borrowed book that is overdue or due soon.

    Args:
        borrow_books: List of dicts with the keys ``book_name``,
            ``book_site``, ``due_times``, ``due_date`` (``YYYY-MM-DD``)
            and ``state``.
        today: ``datetime.date`` used as "today"; defaults to the current
            local date.

    Returns:
        List of message strings. Books due in 8 days or more are skipped.
    """
    if today is None:
        today = datetime.today().date()
    lines = []
    for book in borrow_books:
        book_name = book['book_name']
        book_site = book['book_site']
        due_times = book['due_times']
        state = book['state']
        due_date = datetime.strptime(book['due_date'], '%Y-%m-%d').date()
        delta = (due_date - today).days  # days left until the due date
        if delta < 0:  # negative: already overdue
            lines.append(
                f'🅧 {book_name} (逾期 {abs(delta)} 天{state}, {book_site})'
            )
        elif delta == 0:  # due today
            lines.append(
                f'⓿ {book_name} (今日到期, 續借次數 {due_times}{state}, {book_site})'
            )
        elif delta == 1:  # due tomorrow
            lines.append(
                f'❶ {book_name} (明日到期, 續借次數 {due_times}{state}, {book_site})'
            )
        elif delta == 2:  # due the day after tomorrow
            lines.append(
                f'❷ {book_name} (後天到期, 續借次數 {due_times}{state}, {book_site})'
            )
        elif 2 < delta < 8:  # due in 3 to 7 days
            lines.append(
                f'✦ {book_name} ({book["due_date"]} 到期, '
                f'續借次數 {due_times}{state}, {book_site})'
            )
    return lines


def format_reserve_lines(reserve_books):
    """Build one summary line per reserved book.

    Args:
        reserve_books: List of dicts with the keys ``book_name``,
            ``sequence``, ``ready_for_pickup`` and ``expiration``.

    Returns:
        List of message strings, one per book, in input order.
    """
    lines = []
    for index, book in enumerate(reserve_books):
        marks = READY_MARKS if book['ready_for_pickup'] else WAITING_MARKS
        # Fall back to a plain "(n)" once the bullet glyphs run out instead
        # of failing with IndexError on long reservation lists.
        mark = marks[index] if index < len(marks) else f'({index + 1})'
        book_name = book['book_name']
        if book['ready_for_pickup']:
            lines.append(
                f'{mark} {book_name} (已到館, 保留期限 {book["expiration"]})'
            )
        else:
            lines.append(f'{mark} {book_name} (順位 {book["sequence"]})')
    return lines


if __name__ == '__main__':
    start = time.time()
    token = '我的 Telgram 權杖'
    chat_id = '聊天室 ID'
    if len(sys.argv) != 3:
        print('用法: python3 ksml_lib_12.py 帳號 密碼')
        sys.exit(1)
    # Account and password come from the command line.
    account = sys.argv[1]
    password = sys.argv[2]
    # Scrape borrowed and reserved books.
    borrow_books, reserve_books = get_books(account, password)
    if borrow_books is None and reserve_books is None:
        # get_books() reports failure as (None, None); stop here instead of
        # printing "OK" progress lines for data that was never fetched.
        print('擷取失敗, 未更新資料')
        print(f'執行時間:{time.time()-start}')
        sys.exit(1)
    b_msg = ''  # borrowed-books summary, empty when there is nothing to report
    r_msg = ''  # reservations summary, empty when there is nothing to report
    # Borrowed books.
    if borrow_books:
        borrow = format_borrow_lines(borrow_books)
        if borrow:
            borrow.insert(0, f'\n❖ {account} 的借閱 :')
            b_msg = '\n'.join(borrow)
        print('產生借書到期摘要 ... OK')
    # Reserved books.
    if reserve_books:
        reserve = format_reserve_lines(reserve_books)
        if reserve:
            reserve.insert(0, f'\n❖ {account} 的預約 :')
            r_msg = '\n'.join(reserve)
    print('產生預約書摘要 ... OK')
    if b_msg or r_msg:  # update the remote table when either summary exists
        url = "https://serverless-fdof.onrender.com/function/update_ksml_books"
        payload = {
            "account": account,
            "borrow_books": b_msg,
            "reserve_books": r_msg,
        }
        try:
            # Bounded wait so an unattended (cron) run cannot hang forever;
            # the value is deliberately generous - tune as needed.
            res = requests.post(url, json=payload, timeout=120)
            print(res.json())
        except (requests.RequestException, ValueError) as e:
            # Network failure, timeout, or a response body that is not JSON.
            print(f'更新資料表失敗: {e}')
    end = time.time()
    print(f'執行時間:{end-start}')

此處爬蟲結果放在字典中向佈署於 render.com 的 serverless 平台上的 update_ksml_books 函式提出 POST 請求, 它會將爬蟲訊息儲存在該平台的 SQLite 資料庫 serverless.db 上 :

# update_ksml_books.py

import sqlite3

from datetime import datetime, timedelta

def main(request, **kwargs):
    """Store one account's scraped summaries in the ``ksml_books`` table.

    Example POST body:

        {
            "account": "tony",
            "borrow_books": "書名1 (到期日 2025-10-22); 書名2 ...",
            "reserve_books": "書名A (已到館); 書名B (順位 2) ..."
        }

    Args:
        request: Object exposing ``get_json(force=True)`` that yields the
            parsed request body.
        **kwargs: Accepted but not used by this function.

    Returns:
        A dict with ``"status"`` (``"success"`` or ``"error"``) and a
        human-readable ``"message"``.
    """
    # Local import: the module header only brings in datetime and timedelta.
    from datetime import timezone

    DB_PATH = './serverless.db'
    try:  # parse the JSON body of the POST request into a Python object
        data = request.get_json(force=True)
    except Exception as e:
        return {"status": "error", "message": f"解析 JSON 失敗: {str(e)}"}
    # A valid JSON body can still be a list, a scalar or null; only an
    # object supports the .get() lookups below.
    if not isinstance(data, dict):
        return {"status": "error", "message": "JSON 內容必須是物件"}
    account = data.get('account')
    borrow_books = data.get('borrow_books', '')
    reserve_books = data.get('reserve_books', '')
    # ``account`` is the primary key, so it is mandatory.
    if not account:
        return {"status": "error", "message": "缺少帳號資訊"}
    conn = None
    try:  # update the ksml_books table
        conn = sqlite3.connect(DB_PATH)
        cur = conn.cursor()
        # Create the table on first use.
        cur.execute("""
            CREATE TABLE IF NOT EXISTS ksml_books (
                account TEXT PRIMARY KEY,
                borrow_books TEXT,
                reserve_books TEXT,
                updated_at TEXT
            )
        """)
        # Current time at a fixed UTC+8 offset (Taiwan time), independent of
        # the server's local timezone.
        taiwan_now = datetime.now(timezone(timedelta(hours=8)))
        now_str = taiwan_now.strftime('%Y-%m-%d %H:%M:%S')
        # INSERT OR REPLACE overwrites the row when the account already exists.
        cur.execute("""
            INSERT OR REPLACE INTO ksml_books (account, borrow_books, reserve_books, updated_at)
            VALUES (?, ?, ?, ?)
        """, (account, borrow_books, reserve_books, now_str))
        conn.commit()
        return {"status": "success", "message": f"{account} 的資料已更新"}
    except Exception as e:
        return {"status": "error", "message": str(e)}
    finally:
        # Release the connection on the error path as well.
        if conn is not None:
            conn.close()

執行結果 :

(myenv313) pi@pi3aplus:~ $ python ksml_lib_12.py xxxx118 xxxx27

擷取借閱紀錄 ... OK

擷取預約紀錄 ... OK

產生借書到期摘要 ... OK

產生預約書摘要 ... OK

{'message': 'xxxx118 的資料已更新', 'status': 'success'}

執行時間:348.5533776283264

但我設定 crontab 去跑爬蟲無結果, 手動測試發現不穩定, 常會出現錯誤 :

pi@pi3aplus:~ $ /home/pi/myenv313/bin/python /home/pi/ksml_lib_12.py xxxx119 xxxx16

Message: session not created

from chrome not reachable; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception

Stacktrace:

#0 0x005593e0d070 <unknown>

#1 0x0055938df220 <unknown>

#2 0x0055938cf970 <unknown>

#3 0x00559391894c <unknown>

#4 0x005593915500 <unknown>

#5 0x005593911090 <unknown>

#6 0x005593952ee4 <unknown>

#7 0x00559395286c <unknown>

#8 0x00559391c1f0 <unknown>

#9 0x005593dd8c2c <unknown>

#10 0x005593ddbc84 <unknown>

#11 0x005593ddb878 <unknown>

#12 0x005593dc40a0 <unknown>

#13 0x005593ddc2e0 <unknown>

#14 0x005593daf2a0 <unknown>

#15 0x005593df9fc0 <unknown>

#16 0x005593dfa1b0 <unknown>

#17 0x005593e0bd24 <unknown>

#18 0x007fa1115f74 <unknown>

#19 0x007fa117de88 <unknown>

產生預約書摘要 ... OK

執行時間:94.88470888137817

問 AI 好像是新版 Chromium 更吃記憶體, 新版 Selenium 對記憶體要求也較高, 即使是用 headless 也是容易讓 Chrome 物件建不起來.

我依照 AI 建議將 swap 記憶體加大為 2GB, shm 從 64MB 擴大為 512MB 也沒用 :

pi@pi3aplus:~ $ swapon --show

NAME TYPE SIZE USED PRIO

/dev/zram0 partition 416M 352.1M 100

pi@pi3aplus:~ $ sudo swapoff -a

強制結束

pi@pi3aplus:~ $ sudo fallocate -l 2G /swapfile

pi@pi3aplus:~ $ sudo chmod 600 /swapfile

pi@pi3aplus:~ $ sudo mkswap /swapfile

Setting up swapspace version 1, size = 2 GiB (2147479552 bytes)

no label, UUID=c0c0c57c-3115-418c-9c24-b3571a36a773

pi@pi3aplus:~ $ sudo swapon /swapfile

pi@pi3aplus:~ $ grep -q '/swapfile' /etc/fstab || echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab

/swapfile none swap sw 0 0

pi@pi3aplus:~ $ sudo mount -o remount,size=512M /dev/shm

mount: (hint) your fstab has been modified, but systemd still uses

the old version; use 'systemctl daemon-reload' to reload.

pi@pi3aplus:~ $ free -h

total used free shared buff/cache available

Mem: 416Mi 269Mi 83Mi 24Mi 138Mi 146Mi

Swap: 2.4Gi 333Mi 2.1Gi

pi@pi3aplus:~ $ df -h /dev/shm

檔案系統 容量 已用 可用 已用% 掛載點

tmpfs 512M 0 512M 0% /dev/shm

pi@pi3aplus:~ $ swapon --show

NAME TYPE SIZE USED PRIO

/dev/zram0 partition 416M 353.5M 100

/swapfile file 2G 0B -2

但同樣程式在 Pi 400 卻可順利運行 (而且速度也快很多) :

(myenv313) pi@raspberrypi:~ $ python ksml_lib_12.py xxxx119 xxxx16

擷取借閱紀錄 ... OK

擷取預約紀錄 ... OK

產生借書到期摘要 ... OK

產生預約書摘要 ... OK

{'message': 'xxxx119 的資料已更新', 'status': 'success'}

執行時間:20.821362495422363

所以 Pi 3 要嘛退回 Buster, 要嘛維持目前 Trixie 但只做 Selenium 以外的爬蟲用途了.

在 serverless 平台上還有一個程式 send_ksml_books_messages.py 負責將 serverless.db 的 ksml_books 資料表全部內容取出並傳送到 Telegram, 程式如下 :

# send_ksml_books_messages.py

import asyncio

import sqlite3

from telegram import Bot

async def telegram_send_text(token, chat_id, text):
    """Send ``text`` to a Telegram chat asynchronously.

    Returns True when the message went out; on any error the error is
    printed and False is returned.
    """
    try:
        await Bot(token=token).send_message(chat_id=chat_id, text=text)
    except Exception as err:
        print(f"傳送失敗: {err}")
        return False
    return True

def main(request, **kwargs):
    """Send every stored summary in ``ksml_books`` to Telegram.

    Args:
        request: Incoming request object; not read by this function.
        **kwargs: May contain ``config``, a mapping that provides
            ``TELEGRAM_TOKEN`` and ``TELEGRAM_ID``.

    Returns:
        A status string: a configuration or database error, a notice that
        there is nothing to send, or the success/failure counts.
    """
    DB_PATH = './serverless.db'
    # ``or {}`` also covers a caller that passes config=None explicitly.
    config = kwargs.get('config') or {}
    telegram_token = config.get('TELEGRAM_TOKEN')
    telegram_id = config.get('TELEGRAM_ID')
    if not telegram_token or not telegram_id:
        return '未設定 TELEGRAM_TOKEN 或 TELEGRAM_ID'
    conn = None
    try:  # read all rows from the database
        conn = sqlite3.connect(DB_PATH)
        cur = conn.cursor()
        cur.execute("SELECT borrow_books, reserve_books FROM ksml_books;")
        rows = cur.fetchall()
    except Exception as e:
        return f'資料庫讀取失敗: {e}'
    finally:
        # Release the connection even when the query fails.
        if conn is not None:
            conn.close()
    if not rows:
        return '沒有任何資料可傳送'
    # Send each non-blank summary as its own message.
    success_count = 0
    fail_count = 0
    for borrow_books, reserve_books in rows:
        for msg in (borrow_books, reserve_books):
            if not (msg and msg.strip()):
                continue
            if asyncio.run(telegram_send_text(telegram_token, telegram_id, msg)):
                success_count += 1
            else:
                fail_count += 1
    return f'傳送完成：成功 {success_count} 筆，失敗 {fail_count} 筆'

只要在本地樹莓派 crontab 定期向 serverless 的 send_ksml_books_messages.py 提出一個 GET 請求就會觸發它送出 Telegram 訊息了, 例如下面這個在 Pi 3 A+ 上的 get_ksml_books_messages.py :

# get_ksml_books_messages.py

import requests

# Endpoint whose GET handler sends the stored summaries to Telegram.
url="https://serverless-fdof.onrender.com/function/send_ksml_books_messages"

# Bounded wait so an unattended (cron) run cannot hang forever; the value is
# deliberately generous - tune as needed.
res=requests.get(url, timeout=120)

print(res)

這模式雖然較複雜, 但未來在製作 Telegram 聊天機器人時會較方便, 參考 :

# Python 學習筆記 : 市圖借書與預約爬蟲程式改版 v11

沒有留言 :

張貼留言

訂閱：張貼留言 ( Atom )

小狐狸事務所

2025年12月9日星期二

Python 學習筆記 : 市圖借書與預約爬蟲程式改版 v12 (於 Pi 400)

沒有留言 :

文章標籤

常用連結

2025年12月9日 星期二

Python 學習筆記 : 市圖借書與預約爬蟲程式改版 v12 (於 Pi 400)

沒有留言 :

2025年12月9日星期二