爬取网站

示例一:爬取网站所有文字

import requests
from bs4 import BeautifulSoup

# Target URL to crawl (replace with the page you want to scrape).
url = "https://ksmlc.steam.cf"

# Fetch the page; the timeout prevents the script from hanging forever
# on an unresponsive host.
response = requests.get(url, timeout=10)
# Decode with the detected encoding so non-UTF-8 (e.g. GBK-encoded
# Chinese) pages do not come out garbled.
response.encoding = response.apparent_encoding

# Proceed only if the page was retrieved successfully.
if response.status_code == 200:
    # Parse the HTML document.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all visible text from the document.
    all_text = soup.get_text()

    # Collapse all runs of whitespace into single spaces.
    all_text_without_spaces = ' '.join(all_text.split())

    # Print the cleaned text.
    print(all_text_without_spaces)
else:
    print("无法访问该网页")

爬取网页源码&文字(GUI)

import requests
import tkinter as tk
from tkinter import ttk
from tkinter import scrolledtext
from tkinter import messagebox
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch *url* and return its HTML body as text.

    Sends a desktop-browser User-Agent so sites that reject default
    client strings still respond, and decodes the body with the
    detected (apparent) encoding to avoid mojibake on non-UTF-8 pages.

    Raises requests.RequestException on network failure or timeout
    (handled by the caller's try/except).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69'
    }
    # A timeout keeps the (single-threaded) GUI from freezing
    # indefinitely when the host does not answer.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = response.apparent_encoding
    return response.text


def crawl_website():
    """Handle the "爬取网站" button.

    Reads the URL and the selected mode from the GUI widgets, fetches
    the page, and shows either the raw HTML or the extracted text in
    the output box. Any failure is reported via a message box.
    """
    url = url_entry.get()
    crawl_option = crawl_option_var.get()

    try:
        output_text.config(state=tk.NORMAL)  # make the output box writable

        if crawl_option == "网页源码":
            html = get_html(url)
            output_text.delete(1.0, tk.END)
            output_text.insert(tk.END, html)
            messagebox.showinfo("成功", "网站源码已成功获取并显示在下方文本框中。")
        elif crawl_option == "网页文字":
            # Use the same timeout/encoding handling as get_html so both
            # modes behave consistently on slow or non-UTF-8 sites.
            response = requests.get(url, timeout=10)
            response.encoding = response.apparent_encoding
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                all_text = soup.get_text()
                # Collapse whitespace runs into single spaces.
                all_text_without_spaces = ' '.join(all_text.split())
                output_text.delete(1.0, tk.END)
                output_text.insert(tk.END, all_text_without_spaces)
                messagebox.showinfo("成功", "网站已成功解析并显示在下方文本框中.")
            else:
                messagebox.showerror("错误", "无法访问该网页")
    except Exception as e:
        # Top-level GUI boundary: surface any failure to the user.
        messagebox.showerror("错误", f"爬取失败: {e}")
    finally:
        # Re-disable the output box shortly afterwards so pending
        # insertions are processed first.
        output_text.after(10, lambda: output_text.config(state=tk.DISABLED))


# Create the main application window.
root = tk.Tk()
root.title("网站爬虫")

# --- Build and lay out the widgets ---

# Label + entry for the target URL.
url_label = ttk.Label(root, text="请输入网站URL:")
url_label.grid(column=0, row=0, padx=10, pady=10, sticky=tk.W)

url_entry = ttk.Entry(root, width=40)
url_entry.grid(column=1, row=0, padx=10, pady=10, sticky=tk.W)

# Drop-down selecting what to crawl (page source vs. page text).
crawl_option_label = ttk.Label(root, text="选择爬取方式:")
crawl_option_label.grid(column=0, row=2, padx=10, pady=10, sticky=tk.W)

# NOTE: ttk.OptionMenu consumes the first element after the variable as
# the *default* value, so the menu starts blank and shows the other two
# entries as choices.
crawl_options = ["", "网页源码", "网页文字"]  # leading blank = initial value
crawl_option_var = tk.StringVar(root)
crawl_option_var.set(crawl_options[0])  # start with the blank default
crawl_option_menu = ttk.OptionMenu(root, crawl_option_var, *crawl_options)
crawl_option_menu.grid(column=1, row=2, padx=10, pady=10, sticky=tk.W)

# Button that triggers crawl_website().
crawl_button = ttk.Button(root, text="爬取网站", command=crawl_website)
crawl_button.grid(column=2, row=0, padx=10, pady=10, sticky=tk.W)

# Read-only scrolled text box where results are displayed;
# crawl_website() temporarily re-enables it to insert text.
output_text = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=80, height=20, state=tk.DISABLED)
output_text.grid(column=0, row=3, columnspan=3, padx=10, pady=10)

# Enter the Tk event loop (blocks until the window is closed).
root.mainloop()

GUI打包成品

© 版权声明
THE END
喜欢就支持一下吧
点赞0 分享
评论 抢沙发
头像
欢迎您留下宝贵的见解!
提交
头像

昵称

取消
昵称表情代码图片

    暂无评论内容