爬取网站
爬取网站所有文字
import requests
from bs4 import BeautifulSoup
# URL of the page to crawl -- replace with the page you want to scrape.
url = "https://ksmlc.steam.cf"

# Issue the HTTP request. The timeout prevents the script from hanging
# forever if the server never responds.
response = requests.get(url, timeout=10)

# Only parse the page if the server answered successfully.
if response.status_code == 200:
    # Decode with the detected charset so non-UTF-8 pages come out readable.
    response.encoding = response.apparent_encoding
    # Parse the HTML and extract all visible text.
    soup = BeautifulSoup(response.text, 'html.parser')
    all_text = soup.get_text()
    # Collapse every run of whitespace into a single space.
    all_text_without_spaces = ' '.join(all_text.split())
    print(all_text_without_spaces)
else:
    print("无法访问该网页")
爬取网页源码&文字(GUI)
import requests
import tkinter as tk
from tkinter import ttk
from tkinter import scrolledtext
from tkinter import messagebox
from bs4 import BeautifulSoup
def get_html(url):
    """Fetch *url* and return its decoded HTML source as a string.

    Sends a desktop-browser User-Agent header because some sites reject
    the default python-requests UA. Network errors and timeouts raise
    requests.RequestException, which the caller's try/except reports.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69'
    }
    # Timeout keeps the single-threaded tkinter GUI from freezing
    # indefinitely when the host is unreachable.
    response = requests.get(url, headers=headers, timeout=10)
    # Use the server-detected encoding so non-UTF-8 pages decode correctly.
    response.encoding = response.apparent_encoding
    html = response.text
    return html
def crawl_website():
    """Button handler: fetch the entered URL and display the result.

    Reads the URL and the crawl mode from the GUI widgets, then shows
    either the raw HTML source or the extracted visible text in the
    output box. All failures are reported through message boxes.
    """
    url = url_entry.get()
    crawl_option = crawl_option_var.get()
    try:
        # Make the output box editable so it can be updated below.
        output_text.config(state=tk.NORMAL)
        if crawl_option == "网页源码":
            html = get_html(url)
            output_text.delete(1.0, tk.END)
            output_text.insert(tk.END, html)
            messagebox.showinfo("成功", "网站源码已成功获取并显示在下方文本框中。")
        elif crawl_option == "网页文字":
            # Timeout keeps the single-threaded GUI from hanging forever
            # on a dead host (matches get_html's behavior).
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                # Decode with the detected charset so non-UTF-8 pages
                # render correctly, same as the source-code path.
                response.encoding = response.apparent_encoding
                soup = BeautifulSoup(response.text, 'html.parser')
                all_text = soup.get_text()
                # Collapse whitespace runs into single spaces.
                all_text_without_spaces = ' '.join(all_text.split())
                output_text.delete(1.0, tk.END)
                output_text.insert(tk.END, all_text_without_spaces)
                messagebox.showinfo("成功", "网站已成功解析并显示在下方文本框中.")
            else:
                messagebox.showerror("错误", "无法访问该网页")
    except Exception as e:
        messagebox.showerror("错误", f"爬取失败: {e}")
    finally:
        # Re-disable the box on the next event-loop tick so the inserts
        # above are applied before the widget becomes read-only again.
        output_text.after(10, lambda: output_text.config(state=tk.DISABLED))
# --- Main window ---
root = tk.Tk()
root.title("网站爬虫")
# --- Create and lay out the widgets ---
# Row 0: URL label, entry field, and the crawl button.
url_label = ttk.Label(root, text="请输入网站URL:")
url_label.grid(column=0, row=0, padx=10, pady=10, sticky=tk.W)
url_entry = ttk.Entry(root, width=40)
url_entry.grid(column=1, row=0, padx=10, pady=10, sticky=tk.W)
# Row 2: crawl-mode dropdown ("网页源码" = page source, "网页文字" = page text).
crawl_option_label = ttk.Label(root, text="选择爬取方式:")
crawl_option_label.grid(column=0, row=2, padx=10, pady=10, sticky=tk.W)
crawl_options = ["", "网页源码", "网页文字"] # leading "" acts as the blank placeholder choice
crawl_option_var = tk.StringVar(root)
crawl_option_var.set(crawl_options[0]) # default to the blank option
crawl_option_menu = ttk.OptionMenu(root, crawl_option_var, *crawl_options)
crawl_option_menu.grid(column=1, row=2, padx=10, pady=10, sticky=tk.W)
crawl_button = ttk.Button(root, text="爬取网站", command=crawl_website)
crawl_button.grid(column=2, row=0, padx=10, pady=10, sticky=tk.W)
# Row 3: read-only scrolled text box where fetched results are shown;
# crawl_website toggles its state to NORMAL while writing into it.
output_text = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=80, height=20, state=tk.DISABLED)
output_text.grid(column=0, row=3, columnspan=3, padx=10, pady=10)
# Enter the Tk event loop (blocks until the window is closed).
root.mainloop()
GUI打包成品
© 版权声明
- 本站永久网址:https://blog.ksmlc.cn/
- 本站一切资源不代表本站立场,并不代表本站赞同其观点和对其真实性负责
- 本站资源大多存储在云盘,如发现链接失效,请联系我们,我们会第一时间更新
- 本网站的文章部分内容可能来源于网络,仅供大家学习与参考,如有侵权,请联系站长QQ:2760460838进行删除处理
THE END
暂无评论内容