將母校圖書館爬蟲升版後打鐵趁熱, 今天順便將此次優化 Selenium 爬蟲的技巧也套用在市圖爬蟲程式上, 可同時於 Pi 400, Pi 3B, 與 Pi 3A+ 上執行.
本系列全部測試文章索引參考 :
新版 v13 程式碼如下 :
# ksml_lib_13.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import re
from datetime import datetime
import time
import requests
import sys
from dotenv import dotenv_values
import os
import socket
async def telegram_send_text(text):
    """Send ``text`` to the configured Telegram chat.

    Reads the module-level ``TELEGRAM_TOKEN`` and ``TELEGRAM_ID`` that the
    ``__main__`` section of this script assigns.

    Args:
        text: Message body to send.

    Returns:
        True when the message was sent; False when anything went wrong
        (the error is printed rather than raised).
    """
    try:
        # This script's top-level imports do not bring in Bot, so import it
        # here; a missing package is then reported like any other failure.
        from telegram import Bot

        # Construct the client inside the try so a bad token is also
        # reported as False instead of escaping as an exception.
        bot = Bot(token=TELEGRAM_TOKEN)
        await bot.send_message(
            chat_id=TELEGRAM_ID,
            text=text
        )
    except Exception as e:
        print(f'Error sending text: {e}')
        return False
    return True
# Seconds Selenium keeps polling for a missing element before giving up.
_IMPLICIT_WAIT_SECONDS = 60


def _optional_text(browser, parent, xpath):
    """Return the text of the first node matching ``xpath`` under ``parent``.

    Returns '' when no node matches or the lookup fails. The implicit wait
    is dropped to zero for this one lookup: otherwise every book that lacks
    the optional node would block for the full implicit-wait period.
    """
    try:
        browser.implicitly_wait(0)
        matches = parent.find_elements(By.XPATH, xpath)
        return matches[0].text if matches else ''
    except Exception:
        # Best effort: an unreadable optional field is treated as absent.
        return ''
    finally:
        browser.implicitly_wait(_IMPLICIT_WAIT_SECONDS)


def _parse_borrowed(browser, book):
    """Build the record dict for one borrowed-book element."""
    item = dict()
    book_name = book.find_element(By.XPATH, './h2/a').text
    item['book_name'] = book_name.replace('/', '').strip()
    book_site = book.find_element(By.XPATH, './ul[3]/li[1]').text
    item['book_site'] = re.findall(r'典藏地:(\S+)', book_site)[0]
    due_date = book.find_element(By.XPATH, './ul[4]/li[2]').text
    item['due_date'] = re.findall(r'\d{4}-\d{2}-\d{2}', due_date)[0]
    due_times = book.find_element(By.XPATH, './ul[5]/li[1]').text
    # \d+ rather than a single digit so a count of 10 or more is kept whole.
    item['due_times'] = re.findall(r'\d+', due_times)[0]
    # The sixth <ul> is optional; it is only inspected for the
    # "reserved by someone else" marker.
    state = _optional_text(browser, book, './ul[6]/li[1]')
    item['state'] = ', 有人預約' if '有人預約' in state else ''
    return item


def _parse_reserved(book):
    """Build the record dict for one reserved-book element."""
    item = dict()
    book_name = book.find_element(By.XPATH, './h2/a').text
    item['book_name'] = book_name.replace('/', '').strip()
    sequence = book.find_element(By.XPATH, './ul[7]/li[1]').text
    if '預約待取' in sequence:
        # Arrived and waiting for pickup: the text carries the hold deadline.
        item['ready_for_pickup'] = True
        item['expiration'] = re.findall(r'\d{4}-\d{2}-\d{2}', sequence)[0]
        item['sequence'] = '0'
    else:
        # Still queued: the first number in the text is the queue position.
        item['ready_for_pickup'] = False
        item['expiration'] = ''
        item['sequence'] = re.findall(r'\d+', sequence)[0]
    return item


def get_books(account, password):
    """Log in to the library site and scrape borrowed and reserved books.

    Args:
        account: Login id typed into the login form.
        password: Password typed into the login form.

    Returns:
        ``(borrow_books, reserve_books)``, two lists of dicts.
        Borrowed items have keys ``book_name``, ``book_site``, ``due_date``,
        ``due_times`` and ``state``; reserved items have ``book_name``,
        ``ready_for_pickup``, ``expiration`` and ``sequence``.
        ``(None, None)`` is returned when any step fails; the error is
        printed, not raised.
    """
    browser = None
    result = (None, None)  # default return value when scraping fails
    try:
        # Dedicated Chrome profile directory under the home directory
        # (the original notes this is needed on Trixie).
        chrome_tmp_path = os.path.expanduser('~/chrome_tmp')
        os.makedirs(chrome_tmp_path, exist_ok=True)
        options = Options()
        options.add_argument("--headless=new")  # new headless mode
        options.add_argument("--no-sandbox")  # required on Trixie
        options.add_argument("--disable-dev-shm-usage")  # keep /dev/shm from filling up
        options.add_argument('--disable-gpu')  # avoid GPU driver crashes
        options.add_argument(f'--user-data-dir={chrome_tmp_path}')
        # Cap the disk cache at 100 MB so chrome_tmp does not grow unbounded.
        options.add_argument('--disk-cache-size=104857600')
        options.binary_location = '/usr/bin/chromium'
        service = Service('/usr/bin/chromedriver')
        browser = webdriver.Chrome(service=service, options=options)
        browser.implicitly_wait(_IMPLICIT_WAIT_SECONDS)
        browser.set_window_size(1920, 1080)

        # Load the login page and sign in.
        browser.get('https://webpacx.ksml.edu.tw/personal/')
        # NOTE(review): the element id 'logxinid' looks unusual; confirm it
        # against the live login page.
        loginid = browser.find_element(By.ID, 'logxinid')
        loginid.send_keys(account)
        pincode = browser.find_element(By.ID, 'pincode')
        pincode.send_keys(password)
        div_btn_grp = browser.find_element(By.CLASS_NAME, 'btn_grp')
        login_btn = div_btn_grp.find_element(By.TAG_NAME, 'input')
        login_btn.click()

        # Borrowed books.
        browser.find_element(By.CLASS_NAME, 'redblock').click()
        books = browser.find_elements(By.CLASS_NAME, 'bookdata')
        borrow_books = [_parse_borrowed(browser, book) for book in books]
        print('擷取借閱紀錄 ... OK')
        browser.back()  # return to the previous page

        # Reserved books.
        browser.find_element(By.CLASS_NAME, 'blueblock').click()
        books = browser.find_elements(By.CLASS_NAME, 'bookdata')
        reserve_books = [_parse_reserved(book) for book in books]
        print('擷取預約紀錄 ... OK')
        result = (borrow_books, reserve_books)
    except Exception as e:
        print(f'發生錯誤 : {e}')
    finally:
        if browser is not None:
            try:
                browser.quit()  # end the whole driver session, not just the window
                print('資源已釋放')
            except Exception as e:
                # Cleanup is best effort; report instead of masking the result.
                print(f'釋放資源失敗 : {e}')
    return result
# Position markers for reservations: outlined = still queued, solid = ready
# for pickup. Positions past the end of a tuple fall back to "(n)".
_QUEUED_MARKS = ('①', '②', '③', '④', '⑤', '⑥', '⑦', '⑧', '⑨', '⑩')
_READY_MARKS = ('❶', '❷', '❸', '❹', '❺', '❻', '❼', '❽', '❾', '❿')

# Marker and wording for books due today, tomorrow, and the day after.
_NEAR_DUE = {
    0: ('⓿', '今日到期'),
    1: ('❶', '明日到期'),
    2: ('❷', '後天到期'),
}


def _position_mark(marks, index):
    """Return the marker for a zero-based position, or "(n)" past the end."""
    return marks[index] if index < len(marks) else f'({index + 1})'


def format_borrow_summary(account, borrow_books, today=None):
    """Build the due-date summary text for borrowed books.

    Only books that are overdue or due within the next 7 days are listed.

    Args:
        account: Account name shown in the header line.
        borrow_books: Iterable of dicts with keys ``book_name``,
            ``book_site``, ``due_times``, ``due_date`` (``YYYY-MM-DD``)
            and ``state``.
        today: ``datetime.date`` to measure against; defaults to the
            current local date.

    Returns:
        The multi-line summary, or '' when no book qualifies.
    """
    if today is None:
        today = datetime.today().date()
    lines = []
    for book in borrow_books:
        book_name = book['book_name']
        book_site = book['book_site']
        due_times = book['due_times']
        state = book['state']
        due = datetime.strptime(book['due_date'], '%Y-%m-%d').date()
        delta = (due - today).days  # days left; negative means overdue
        if delta < 0:
            lines.append(
                f'🅧 {book_name} (逾期 {abs(delta)} 天{state}, {book_site})')
        elif delta in _NEAR_DUE:
            mark, wording = _NEAR_DUE[delta]
            lines.append(
                f'{mark} {book_name} ({wording}, '
                f'續借次數 {due_times}{state}, {book_site})')
        elif delta < 8:
            # 3 to 7 days left: show the actual due date.
            lines.append(
                f'✦ {book_name} ({book["due_date"]} 到期, '
                f'續借次數 {due_times}{state}, {book_site})')
    if not lines:
        return ''
    return '\n'.join([f'\n❖ {account} 的借閱 :', *lines])


def format_reserve_summary(account, reserve_books):
    """Build the summary text for reserved books.

    Args:
        account: Account name shown in the header line.
        reserve_books: Iterable of dicts with keys ``book_name``,
            ``sequence``, ``ready_for_pickup`` and ``expiration``.

    Returns:
        The multi-line summary, or '' when there are no reservations.
    """
    lines = []
    for index, book in enumerate(reserve_books):
        book_name = book['book_name']
        if book['ready_for_pickup']:
            mark = _position_mark(_READY_MARKS, index)
            lines.append(
                f'{mark} {book_name} (已到館, 保留期限 {book["expiration"]})')
        else:
            mark = _position_mark(_QUEUED_MARKS, index)
            lines.append(f'{mark} {book_name} (順位 {book["sequence"]})')
    if not lines:
        return ''
    return '\n'.join([f'\n❖ {account} 的預約 :', *lines])


if __name__ == '__main__':
    start = time.time()
    config = dotenv_values('.env')
    # Kept as module-level names because telegram_send_text() reads them.
    TELEGRAM_TOKEN = config.get('TELEGRAM_TOKEN')
    TELEGRAM_ID = config.get('TELEGRAM_ID')
    host_name = socket.gethostname()
    print(f'主機 : {host_name}')
    if len(sys.argv) != 3:
        print(f'用法: {sys.argv[0]} 帳號 密碼')
        sys.exit(1)
    # Account and password come from the command line.
    account = sys.argv[1]
    password = sys.argv[2]
    # get_books() yields (None, None) on failure, which skips both sections.
    borrow_books, reserve_books = get_books(account, password)
    b_msg = ''  # borrowed-books summary
    r_msg = ''  # reserved-books summary
    if borrow_books:
        b_msg = format_borrow_summary(account, borrow_books)
        print('產生借書到期摘要 ... OK')
    if reserve_books:
        r_msg = format_reserve_summary(account, reserve_books)
        print('產生預約書摘要 ... OK')
    if b_msg or r_msg:  # update the backend table only when there is something to store
        url = "https://serverless-5e6i.onrender.com/function/update_ksml_books"
        payload = {
            "account": account,
            "borrow_books": b_msg,
            "reserve_books": r_msg
        }
        try:
            # Bound the wait so a stalled backend cannot hang the script.
            res = requests.post(url, json=payload, timeout=120)
            print(res.json())
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a reply body that is not valid JSON.
            print(f'更新資料表失敗 : {e}')
    end = time.time()
    print(f'執行時間:{end-start}')
此次改版也修正了 try-except-finally 結構, 把 browser.close() 改成 browser.quit(), 前者雖然會把視窗關掉, 但背景的 chromedriver 可能還在跑繼續吃 RAM, 導致背景殘留了一堆 chromedriver 的殭屍進程. 此優化版程式在 Pi 3B, 3A+ 與 Pi 400 均可順利執行.
執行結果如下 :
pi@kaopi3:~ $ python ksml_lib_12.py faxxxxxx 123456
擷取借閱紀錄 ... OK
擷取預約紀錄 ... OK
產生借書到期摘要 ... OK
產生預約書摘要 ... OK
{'message': 'faxxxxxx 的資料已更新', 'status': 'success'}
執行時間:667.9564106464386
pi@pi3aplus:~ $ python ksml_lib_12.py faxxxxxx 123456
主機 : pi3aplus
擷取借閱紀錄 ... OK
擷取預約紀錄 ... OK
資源已釋放
產生借書到期摘要 ... OK
產生預約書摘要 ... OK
{'message': 'faxxxxxx 的資料已更新', 'status': 'success'}
執行時間:675.6979095935822
(myenv313) pi@pi400:~ $ python ksml_lib_12.py faxxxxxx 123456
主機 : pi400
擷取借閱紀錄 ... OK
擷取預約紀錄 ... OK
資源已釋放
產生借書到期摘要 ... OK
產生預約書摘要 ... OK
{'message': 'faxxxxxx 的資料已更新', 'status': 'success'}
執行時間:573.2406423091888
可見 Pi 3B 與 3A+ 速度差不多, 但 Pi 400 就快了 100 秒.
注意, 此處呼叫的後端是建置在 Render 平台上的 serverless 服務, 目前共有兩個端點, 各主機的分配如下 :
- kaopi3 : https://serverless-5e6i.onrender.com/function/send_books_messages
- pi3aplus : https://serverless-fdof.onrender.com/function/send_books_messages
- pi400 : https://serverless-fdof.onrender.com/function/send_books_messages (備用)
擷取並傳送借書資訊的程式 get_ksml_books_messages.py 也是要設定對應端點, 例如 kaopi3 :
# get_ksml_books_messages.py
import requests
import socket
# Report which machine is running this script.
host_name = socket.gethostname()
print(f'主機 : {host_name}')

# Call the send_books_messages endpoint, passing the host name as the
# `crawler` query parameter, then print the response object.
url = 'https://serverless-5e6i.onrender.com/function/send_books_messages'
res = requests.get(url, params={'crawler': host_name})
print(res)
此爬蟲程式會呼叫 serverless 平台上的 send_books_messages.py 函式讀取 serverless.db 上記錄的借書與預約資訊, 並送出 Telegram 訊息. send_books_messages.py 函式內容如下 :
# send_books_messages.py
import asyncio
import sqlite3
from telegram import Bot
async def telegram_send_text(token, chat_id, text):
    """Send ``text`` to a Telegram chat without raising.

    Args:
        token: Bot token used to build the ``Bot`` client.
        chat_id: Destination chat id.
        text: Message body.

    Returns:
        True when the message was sent, False when any exception occurred
        (the error is printed).
    """
    try:
        await Bot(token=token).send_message(chat_id=chat_id, text=text)
    except Exception as err:
        print(f"傳送失敗: {err}")
        return False
    return True
def main(request, **kwargs):
    """Send every stored borrow/reserve summary as Telegram messages.

    Reads all rows of the ``ksml_books`` table in ``./serverless.db`` and
    sends each non-blank ``borrow_books`` / ``reserve_books`` value as its
    own message.

    Args:
        request: Incoming request object; not used by this function.
        **kwargs: ``config`` may hold a mapping with ``TELEGRAM_TOKEN`` and
            ``TELEGRAM_ID``.

    Returns:
        A status string: a configuration or database error message, a
        "nothing to send" notice, or the success/failure counts.
    """
    DB_PATH = './serverless.db'
    # `or {}` also covers an explicit config=None.
    config = kwargs.get('config') or {}
    telegram_token = config.get('TELEGRAM_TOKEN')
    telegram_id = config.get('TELEGRAM_ID')
    if not telegram_token or not telegram_id:
        return '未設定 TELEGRAM_TOKEN 或 TELEGRAM_ID'
    try:
        conn = sqlite3.connect(DB_PATH)
        try:
            cur = conn.cursor()
            cur.execute("SELECT borrow_books, reserve_books FROM ksml_books;")
            rows = cur.fetchall()
        finally:
            # Close even when the query fails so the connection is not leaked.
            conn.close()
    except Exception as e:
        return f'資料庫讀取失敗: {e}'
    if not rows:
        return '沒有任何資料可傳送'
    # Send the messages and tally the outcomes.
    success_count = 0
    fail_count = 0
    for borrow_books, reserve_books in rows:
        for msg in (borrow_books, reserve_books):
            if msg and msg.strip():
                ok = asyncio.run(
                    telegram_send_text(telegram_token, telegram_id, msg))
                if ok:
                    success_count += 1
                else:
                    fail_count += 1
    return f'傳送完成:成功 {success_count} 筆,失敗 {fail_count} 筆'
市圖爬蟲架構較複雜, 得畫一張圖來備忘才行.
沒有留言 :
張貼留言