web智能体中的pagesource怎样修改?有没有成熟的pagesource源码可以参考

源码如下:
selenium_tools.py

import json
from pathlib import Path
from typing import Optional, Dict, Any, Tuple
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.common.exceptions import (
    NoSuchElementException, ElementClickInterceptedException,
    WebDriverException, TimeoutException
)

from selenium.webdriver.chrome.service import Service


class WebAutoFramework:

    def __init__(self, headless: bool = False, max_token_length: int = 8000):
        self.driver: Optional[WebDriver] = None
        self.max_token_length = max_token_length
        self.headless = headless
        self._initialize_driver()
        self.element = None
        # 扩展需要收集的标签类型
        self.target_tags = [
            'button', 'input', 'select', 'textarea',  # 表单元素
            'a', 'span', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',  # 文本与链接
            'div', 'section', 'article', 'header', 'footer',  # 布局元素
            'table', 'tr', 'td', 'th', 'ul', 'ol', 'li',  # 列表与表格
            'img', 'video', 'audio'  # 媒体元素
        ]

    def _initialize_driver(self) -> None:
        try:
            options = webdriver.ChromeOptions()
            if self.headless:
                options.add_argument('--headless=new')

            # 增强浏览器稳定性参数
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-gpu')
            options.add_argument('--ignore-certificate-errors')
            options.add_argument('--disable-dev-shm-usage')

            # 驱动路径(确保正确)
            driver_path = r"D:\chrome_download\chrome_driver\chromedriver-win64\chromedriver.exe"

            # 验证驱动存在性
            if not Path(driver_path).exists():
                raise FileNotFoundError(f"ChromeDriver不存在: {driver_path}")

            service = Service(executable_path=driver_path)
            self.driver = webdriver.Chrome(
                service=service,
                options=options
            )
            self.driver.set_page_load_timeout(30)
            self.driver.implicitly_wait(10)

        except WebDriverException as e:
            raise RuntimeError(f"初始化浏览器失败: {str(e)}")

    # def __init__(self):
    #     self.driver = None
    #     self.element = None
    #
    # def init(self):
    #     if not self.driver:
    #         self.driver = webdriver.Chrome()
    #         self.driver.implicitly_wait(5)

    def open(self, url):

        self.driver.get(url)
        return self.source()

    def source(self):
        return self.driver.execute_script(
            """
            var content="";
            document.querySelectorAll('button').forEach(x=> content+=x.outerHTML);
            document.querySelectorAll('input').forEach(x=> content+=x.outerHTML);
            //document.querySelectorAll('li').forEach(x=> content+=x.outerHTML);
            //document.querySelectorAll('table').forEach(x=> content+=x.outerHTML);
            return content;
            """
        )

    def click(self):
        """
        点击当前的元素
        :return:
        """
        self.element.click()

        return self.source()

    def send_keys(self, text):
        self.element.clear()
        self.element.send_keys(text)
        return self.source()

    def find(self, locator):
        print(f"find css = {locator}")
        element = self.driver.find_element(by=By.CSS_SELECTOR, value=locator)
        self.element = element
        return self.source()

    def maximizeWindow(self):
        """
        浏览器最大化
        :return:
        """

        self.driver.maximize_window()
        return self.source()

    def quit(self):
        # self.driver.quit()
        pass

    def get_current_url(self):
        print(f"当前的url为{self.driver.current_url}")
        return self.driver.current_url

selenium_agent.py如下

import time
from hogwarts_logger import info
from langchain.globals import set_verbose


from langchain_core.globals import set_debug

from langgraph.prebuilt.chat_agent_executor import AgentState, create_react_agent


from testcase_gen.llm import model_chatpt, model_ollama

set_debug(True)
set_verbose(True)
from langchain_core.tools import tool

from langraph_agent.selenium_tools_ad import WebAutoFramework

web = WebAutoFramework()
@tool
def open(url: str):
    """
    使用浏览器打开特定的url,并返回网页内容
    """
    r = web.open(url)
    return r

@tool
def find(css: str):
    """定位网页元素"""
    return web.find(css)

@tool
def click(css: str = None):
    """以css的方式定位网页元素后点击"""
    web.find(css)
    return web.click()

@tool
def send_keys(css, text):
    """定位到css指定的元素,并输入text"""
    web.find(css)
    return web.send_keys(text)

@tool
def sleep(seconds: int):
    """等待指定的秒数"""
    time.sleep(seconds)


@tool
def quit():
    """退出浏览器"""
    web.quit()

@tool
def get_current_url():
    """获取当前的url"""
    return web.get_current_url()

@tool
def maximizeWindow():
    """
    浏览器最大化
    :return:
    """
    return web.maximizeWindow()

tools = [open, quit, get_current_url, find, click, send_keys,maximizeWindow]

web_agent = create_react_agent(model=model_ollama,
                               tools=tools,
                               prompt="你是一个测试工程师。"
                                      "每次只返回包含一个action的工具调用。"
                                      "每次只返回一个tool_call action。"
                                      "每次只返回一个function_call action。")


def test_page_source_v3():
    # 构建符合AgentState要求的输入
    state = {
        "messages": [{
            "role": "user",
            "content": "1. 浏览器最大化后打开 https://litemall.hogwarts.ceshiren.com/#/login?redirect=%2Fdashboard "
                       "2. 输入用户名 hogwarts"
                       " 3. 输入密码 test12345"
                       " 4. 点击登录按钮"
                       " 5. 进入首页后"
                        "6.点击商品管理,点击商品列表,点击添加按钮"

        }],
        "tools": tools,
        "tool_names": [tool.name for tool in tools]
    }
    r = web_agent.invoke(state,config={"recursion_limit": 50})
    info(r)

问题

无法定位到登录按钮,有时候可以定位登录按钮,有时候又定位不到;

报错信息

{
  "model":"qwen2.5:latest","stream":true,"options": {
    
  }
  ,"messages":[ {
    "role":"system","content":"你是一个测试工程师。每次只返回包含一个action的工具调用。每次只返回一个tool_call action。每次只返回一个function_call action。"
  }
  , {
    "role":"user","content":"1. 浏览器最大化后打开 https://litemall.hogwarts.ceshiren.com/#/login?redirect=%2Fdashboard 2. 输入用户名 hogwarts 3. 输入密码 test12345 4. 点击登录按钮 5. 进入首页后6.点击商品管理,点击商品列表,点击添加按钮"
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"maximizeWindow","arguments": {
          
        }
        
      }
      
    }
    , {
      "function": {
        "name":"open","arguments": {
          "url":"https://litemall.hogwarts.ceshiren.com/#/login?redirect=%2Fdashboard"
        }
        
      }
      
    }
    , {
      "function": {
        "name":"send_keys","arguments": {
          "css":".username-input","text":"hogwarts"
        }
        
      }
      
    }
    , {
      "function": {
        "name":"send_keys","arguments": {
          "css":".password-input","text":"test12345"
        }
        
      }
      
    }
    , {
      "function": {
        "name":"click","arguments": {
          "css":".login-button"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"<button data-v-31756f19=\"\" type=\"button\" class=\"el-button el-button--primary el-button--mini\" style=\"width: 100%; margin-bottom: 30px;\"><!----><!----><span>登录</span></button><input tabindex=\"1\" type=\"text\" autocomplete=\"on\" name=\"username\" placeholder=\"管理员账户\" class=\"el-input__inner\"><input tabindex=\"2\" type=\"password\" autocomplete=\"on\" name=\"password\" placeholder=\"管理员密码\" class=\"el-input__inner\">"
  }
  , {
    "role":"tool","content":"<button data-v-31756f19=\"\" type=\"button\" class=\"el-button el-button--primary el-button--mini\" style=\"width: 100%; margin-bottom: 30px;\"><!----><!----><span>登录</span></button><input tabindex=\"1\" type=\"text\" autocomplete=\"on\" name=\"username\" placeholder=\"管理员账户\" class=\"el-input__inner\"><input tabindex=\"2\" type=\"password\" autocomplete=\"on\" name=\"password\" placeholder=\"管理员密码\" class=\"el-input__inner\">"
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".login-button"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"find","arguments": {
          "css":".login-button"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"<button data-v-31756f19=\"\" type=\"button\" class=\"el-button el-button--primary el-button--mini is-loading\" style=\"width: 100%; margin-bottom: 30px;\" disabled=\"disabled\"><i class=\"el-icon-loading\"></i><!----><span>登录</span></button><input tabindex=\"1\" type=\"text\" autocomplete=\"on\" name=\"username\" placeholder=\"管理员账户\" class=\"el-input__inner\"><input tabindex=\"2\" type=\"password\" autocomplete=\"on\" name=\"password\" placeholder=\"管理员密码\" class=\"el-input__inner\">"
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini.is-loading"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: ElementClickInterceptedException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini[disabled='disabled'] + input[type='text'][name='username']"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"send_keys","arguments": {
          "css":".el-input__inner[name='username']","text":"hogwarts"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"<button data-v-31756f19=\"\" type=\"button\" class=\"el-button el-button--primary el-button--mini is-loading\" style=\"width: 100%; margin-bottom: 30px;\" disabled=\"disabled\"><i class=\"el-icon-loading\"></i><!----><span>登录</span></button><input tabindex=\"1\" type=\"text\" autocomplete=\"on\" name=\"username\" placeholder=\"管理员账户\" class=\"el-input__inner\"><input tabindex=\"2\" type=\"password\" autocomplete=\"on\" name=\"password\" placeholder=\"管理员密码\" class=\"el-input__inner\">"
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"send_keys","arguments": {
          "css":".el-input__inner[name='password']","text":"test12345"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"<button data-v-31756f19=\"\" type=\"button\" class=\"el-button el-button--primary el-button--mini is-loading\" style=\"width: 100%; margin-bottom: 30px;\" disabled=\"disabled\"><i class=\"el-icon-loading\"></i><!----><span>登录</span></button><input tabindex=\"1\" type=\"text\" autocomplete=\"on\" name=\"username\" placeholder=\"管理员账户\" class=\"el-input__inner\"><input tabindex=\"2\" type=\"password\" autocomplete=\"on\" name=\"password\" placeholder=\"管理员密码\" class=\"el-input__inner\">"
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini.is-loading"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: ElementClickInterceptedException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini[disabled='disabled']"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: ElementClickInterceptedException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"find","arguments": {
          "css":".el-button--primary.el-button--mini.is-loading + .el-button--primary.el-button--mini"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini[disabled='disabled'] + .el-button--primary.el-button--mini"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini[disabled='disabled'] + .el-button--primary.el-button--mini"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini[disabled='disabled'] + .el-button--primary.el-button--mini"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: ElementClickInterceptedException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"find","arguments": {
          "css":"#home-page"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: ElementClickInterceptedException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"find","arguments": {
          "css":"#home-page"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: ElementClickInterceptedException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"find","arguments": {
          "css":"#home-page"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-menu-item[title='商品管理']"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"find","arguments": {
          "css":".el-submenu__item[title='商品管理'] .el-menu-item[title='商品列表']"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-submenu__item[title='商品管理'] .el-menu-item[title='商品列表']"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  , {
    "role":"assistant","tool_calls":[ {
      "function": {
        "name":"click","arguments": {
          "css":".el-button--primary.el-button--mini[title='添加']"
        }
        
      }
      
    }
    ]
  }
  , {
    "role":"tool","content":"Error: NoSuchElementException()\n Please fix your mistakes."
  }
  ],"tools":[ {
    "type":"function","function": {
      "name":"open","description":"使用浏览器打开特定的url,并返回网页内容","parameters": {
        "type":"object","required":["url"],"properties": {
          "url": {
            "type":"string"
          }
          
        }
        
      }
      
    }
    
  }
  , {
    "type":"function","function": {
      "name":"quit","description":"退出浏览器","parameters": {
        "type":"object","properties": {
          
        }
        
      }
      
    }
    
  }
  , {
    "type":"function","function": {
      "name":"get_current_url","description":"获取当前的url","parameters": {
        "type":"object","properties": {
          
        }
        
      }
      
    }
    
  }
  , {
    "type":"function","function": {
      "name":"find","description":"定位网页元素","parameters": {
        "type":"object","required":["css"],"properties": {
          "css": {
            "type":"string"
          }
          
        }
        
      }
      
    }
    
  }
  , {
    "type":"function","function": {
      "name":"click","description":"以css的方式定位网页元素后点击","parameters": {
        "type":"object","properties": {
          "css": {
            "type":"string"
          }
          
        }
        
      }
      
    }
    
  }
  , {
    "type":"function","function": {
      "name":"send_keys","description":"定位到css指定的元素,并输入text","parameters": {
        "type":"object","required":["css","text"],"properties": {
          "css": {
            
          }
          ,"text": {
            
          }
          
        }
        
      }
      
    }
    
  }
  , {
    "type":"function","function": {
      "name":"maximizeWindow","description":"浏览器最大化\n:return:","parameters": {
        "type":"object","properties": {
          
        }

环境

市面上最常见的web操作工具 browser-use中有提供一套js代码,它里面有不少的过滤规则可以参考一下。当然没必要完全搬来用,毕竟它最终的实现方式还有点特殊,但是里面一些过滤的逻辑和方法可以参考。如果不太理解的话可以让大模型给那个js增加一些注释或者帮你挑选一些可用的逻辑搬运过来。

参考了browser use的js后发现返回的页面源码超token
如何让token精讲 ?
以下代码是让ai精简的代码,可是ai会产生幻觉,乱猜测一个定位元素,如何修改?

   def page_source_v3(self) -> Dict[str, Any]:
        if not self.driver:
            return {"error": "浏览器未初始化"}

        try:
            raw_content = self.driver.execute_script(f"let f={JS_CONTENT};return f();")
            print(f"返回的page_source_v3为{raw_content}")
            print(f"精简后的内容如下:",self._optimize_content(raw_content))
            return self._optimize_content(raw_content)
        except Exception as e:
            return {"error": f"获取页面元素信息失败: {str(e)}"}

    def _estimate_tokens(self, content: str) -> int:
        chinese_chars = sum(1 for c in content if '\u4e00' <= c <= '\u9fff')
        other_chars = len(content) - chinese_chars
        return int(chinese_chars / 2 + other_chars / 4)

    def _truncate_content(self, content: str) -> str:
        if self._estimate_tokens(content) <= self.max_token_length:
            return content

        max_chars = self.max_token_length * 3
        return content[:max_chars] + "\n...[内容已截断以符合token限制]"

    # def _optimize_content(self, content: Any) -> Any:
    #     if isinstance(content, dict):
    #         optimized = {}
    #         important_keys = ['tag', 'id', 'class', 'text', 'href', 'visible', 'clickable', 'children','name']
    #
    #         for key, value in content.items():
    #             if key not in important_keys:
    #                 continue
    #
    #             if key == 'children' and isinstance(value, list):
    #                 optimized_children = [self._optimize_content(child) for child in value[:50]]
    #                 optimized[key] = optimized_children
    #             elif key == 'text' and isinstance(value, str):
    #                 optimized[key] = value[:300] + ("..." if len(value) > 300 else "")
    #             else:
    #                 optimized[key] = value
    #
    #         if self._estimate_tokens(str(optimized)) > self.max_token_length:
    #             optimized = self._further_simplify(optimized)
    #
    #         return optimized
    #
    #     elif isinstance(content, list):
    #         return [self._optimize_content(item) for item in content[:30]]
    #
    #     return content
    def _optimize_content(self, content: Any) -> Any:
        if isinstance(content, dict):
            optimized = {}
            important_keys = ['tag', 'id', 'class', 'text', 'href', 'visible', 'clickable', 'children', 'name',
                              'tagName', 'xpath']

            # 针对input元素:从attributes中提取name属性到顶层
            if content.get('tagName') == 'input' and 'attributes' in content:
                # 将name属性提升到顶层,方便大模型直接识别
                optimized['name'] = content['attributes'].get('name')
                # 同时保留原始attributes中的关键属性(可选)
                if 'class' in content['attributes']:
                    optimized['class'] = content['attributes']['class']

            # 处理其他关键属性
            for key, value in content.items():
                # 跳过已经处理过的name属性(避免重复)
                if key == 'name' and 'name' in optimized:
                    continue

                if key not in important_keys:
                    continue

                # 特殊处理children字段(限制数量并递归优化)
                if key == 'children' and isinstance(value, list):
                    # 限制子元素数量为前50个,避免数据量过大
                    optimized_children = [self._optimize_content(child) for child in value[:50]]
                    optimized[key] = optimized_children
                # 特殊处理text字段(截断过长文本)
                elif key == 'text' and isinstance(value, str):
                    # 文本长度限制为300字符,超过则添加省略号
                    optimized[key] = value[:300] + ("..." if len(value) > 300 else "")
                # 处理tagName时统一转为小写(如INPUT -> input)
                elif key == 'tagName' and isinstance(value, str):
                    optimized[key] = value.lower()
                # 保留其他关键属性
                else:
                    optimized[key] = value

            # 若优化后内容仍超过token限制,进一步简化
            if self._estimate_tokens(str(optimized)) > self.max_token_length:
                optimized = self._further_simplify(optimized)

            return optimized

        elif isinstance(content, list):
            # 限制列表长度为前30个元素,避免顶层数据量过大
            return [self._optimize_content(item) for item in content[:30]]

        # 非字典/列表类型直接返回
        return content

    def _further_simplify(self, content: Dict[str, Any]) -> Dict[str, Any]:
        if 'children' in content and isinstance(content['children'], list):
            content['children'] = content['children'][:len(content['children']) // 2]
            content['warning'] = "内容已大幅简化以符合token限制"
        return content