import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url_list = ['https://www.sdtbu.edu.cn/info/1043/35641.htm','https://www.sdtbu.edu.cn/info/1043/35612.htm']

def fetch_and_parse(url):
    response = requests.get(url, timeout=5)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Report the page title (guard against pages without a <title> tag)
    title = soup.find('title')
    print("Page title:", title.text if title else "N/A")

    # The article body sits in a div with class "content-box"
    content_box = soup.find('div', class_='content-box')
    if not content_box:
        print("No div element with class 'content-box' was found")
        return

    img_tags = content_box.find_all('img')
    if not img_tags:
        print("No images were found inside content-box")
        return

    print(f"Found {len(img_tags)} image(s) in total.")
    # Inspect each image referenced in the article body
    for i, img in enumerate(img_tags, 1):
        img_url = img.get('src')
        if not img_url:
            print(f"Image {i}: no src attribute, skipping")
            continue
        # src may be relative, so resolve it against the page URL
        full_img_url = urljoin(url, img_url)

        try:
            img_response = requests.get(full_img_url, stream=True, timeout=5)
            img_response.raise_for_status()

            # Prefer the Content-Length header; fall back to downloading the body
            content_length = img_response.headers.get('content-length')
            if content_length:
                size_bytes = int(content_length)
            else:
                size_bytes = len(img_response.content)

            if size_bytes >= 1024 * 1024:
                size_str = f"{size_bytes / (1024 * 1024):.2f} MB"
            else:
                size_str = f"{size_bytes / 1024:.2f} KB"
            print(f"Image {i}: {size_str} ({full_img_url})")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download image: {e}")
        except Exception as e:
            print(f"Error while processing image: {e}")
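

# Possible refinement (not part of the original script): the size check could
# avoid downloading image bytes entirely by issuing a HEAD request first and
# only falling back to GET when the server omits Content-Length. A minimal
# sketch, assuming the target servers answer HEAD requests; the helper name
# get_image_size is hypothetical.
def get_image_size(img_url, timeout=5):
    head = requests.head(img_url, allow_redirects=True, timeout=timeout)
    head.raise_for_status()
    content_length = head.headers.get('content-length')
    if content_length:
        return int(content_length)
    # Fallback: download the body and measure it
    body = requests.get(img_url, timeout=timeout)
    body.raise_for_status()
    return len(body.content)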


if __name__ == "__main__":
    for url in url_list:
        fetch_and_parse(url)