BeautifulSoup

用网络库下载到页面源码后,就是去解析(HTML等)内容了。

Python中最常用的HTML(和XML)解析库之一就是:BeautifulSoup

从html转soup

from bs4 import BeautifulSoup

def htmlToSoup(curHtml):
    """Parse an HTML string into a BeautifulSoup object.

    Args:
        curHtml (str): HTML source text
    Returns:
        soup: BeautifulSoup object built with the 'html.parser' parser
    """
    return BeautifulSoup(curHtml, 'html.parser')

从xml转换出soup

背景:

iOS自动化期间,常会涉及到,获取到当前页面源码,是xml字符串,需要转换为soup,才能后续操作

所以整理出通用转换逻辑

def xmlToSoup(xmlStr):
    """Parse an XML string into a BeautifulSoup object.

    Note: XML tag names are case sensitive, so an XML parser is used to
    retain upper-case letters in tag names (tags are NOT converted to
    lowercase).

    Args:
        xmlStr (str): XML text, normally a page source
    Returns:
        soup: BeautifulSoup object built with the 'lxml-xml' parser
    """
    # Other parser names tried earlier: 'html.parser', 'xml'
    return BeautifulSoup(xmlStr, 'lxml-xml')

举例:

(1)

    # Fetch the source of the current page (an XML string), then parse it into a soup
    curPageXml = self.get_page_source()
    soup = CommonUtils.xmlToSoup(curPageXml)

获取到xml字符串后,去转换为soup

soup转html


def soupToHtml(soup, isFormat=True):
    """Serialize a soup into an HTML string.

    Args:
        soup (Soup): BeautifulSoup soup
        isFormat (bool): True -> return soup.prettify() (formatted html),
            False -> return str(soup) (not formatted html)
    Returns:
        html (str)
    """
    return soup.prettify() if isFormat else str(soup)

获取soup节点所有的文字内容


def getAllContents(curNode, isStripped=False):
    """Collect the text of the current node and its children into one string.

    Args:
        curNode (soup node): current BeautifulSoup node
        isStripped (bool): True -> read curNode.stripped_strings,
            False -> read curNode.strings
    Returns:
        str: all text pieces joined with a newline
    """
    textPieces = curNode.stripped_strings if isStripped else curNode.strings
    joinedText = "\n".join(textPieces)
    logging.debug("codeSnippetStr=%s", joinedText)
    return joinedText

从html中提取title值

def extractHtmlTitle_BeautifulSoup(htmlStr):
    """Extract the title text from an html string, using BeautifulSoup.

    Args:
        htmlStr (str): html string
    Returns:
        str: stripped text of the title element; empty string when the
            html has no title or the title has no single string
    """
    soup = BeautifulSoup(htmlStr, "html.parser")
    if not soup:
        logging.error("Failed to convert to soup for html: %s", htmlStr)
        return ""

    if soup.title and soup.title.string:
        return soup.title.string.strip()

    # Seen in practice for pages that only contain a redirect <script>
    # or a plain-text body such as 'Illegal access address!\n'
    logging.debug("Empty title for html: %s", htmlStr)

    # for debug: report html that does not contain a <title> tag at all
    if "<title>" not in htmlStr:
        logging.warning("Special not incldue <title> html: %s", htmlStr)

    return ""

是否包含符合特定条件的soup节点

def isContainSpecificSoup(soupList, elementName, isSizeValidCallback, matchNum=1):
    """Check whether a list of soup nodes contains the expected number of matching elements.

    A node is counted as matched when:
        its tag name equals elementName
        its area (width * height, read from the node attributes) is accepted
        by isSizeValidCallback
    Nodes of the wanted tag whose width/height attribute is missing or is not
    an integer are skipped (not counted) instead of raising.

    Args:
        soupList (list): BeautifulSoup nodes to inspect
        elementName (str): element name
        isSizeValidCallback (function): callback function to check whether element size(width * height) is valid or not
        matchNum (int): only exactly this number of matched nodes is considered as found; 0 always gives True
    Returns:
        bool
    """
    matchedSoupNum = 0

    for eachSoup in soupList:
        # Items without a tag name (e.g. plain str objects) can not match
        if not hasattr(eachSoup, "name") or eachSoup.name != elementName:
            continue

        soupAttr = getattr(eachSoup, "attrs", None)
        try:
            soupWidth = int(soupAttr["width"])
            soupHeight = int(soupAttr["height"])
        except (KeyError, TypeError, ValueError):
            # no attrs / missing width or height / non-integer value -> size unknown, skip
            continue

        curSoupSize = soupWidth * soupHeight # 326 * 270
        if isSizeValidCallback(curSoupSize):
            matchedSoupNum += 1

    if matchNum == 0:
        return True

    return matchedSoupNum == matchNum

说明:

判断soup内,是否有符合特定条件的soup

举例:

(1)iOS的弹框,右上角带关闭按钮时,去判断关闭按钮后面的兄弟节点是否符合对应条件,以便于判断是否可能是弹框

# Materialize the siblings that follow the candidate close button into a list
nextSiblingeSoupGenerator = possibleCloseSoup.next_siblings
nextSiblingeSoupList = list(nextSiblingeSoupGenerator)

# With the default matchNum=1: True only when exactly one following sibling is an
# XCUIElementTypeImage whose area is accepted by self.isPopupWindowSize
hasLargeImage = CommonUtils.isContainSpecificSoup(nextSiblingeSoupList, "XCUIElementTypeImage", self.isPopupWindowSize)
isPossibleClose = hasLargeImage

相关函数

def isPopupWindowSize(self, curSize):
    """Tell whether a soup size (width * height) is the size of a popup-like window (Image, Other, etc.).

    A popup-like window should be large enough but should not be full screen:
    its share of the full screen area (self.X * self.totalY) must be between
    0.25 and 0.8, both inclusive.

    Args:
        curSize (int): element size, width * height
    Returns:
        bool
    """
    minRatio, maxRatio = 0.25, 0.8
    screenArea = self.X * self.totalY
    return minRatio <= (curSize / screenArea) <= maxRatio # e.g. ratio 0.289 -> True

(2)

# With the default matchNum=1: True only when exactly one sibling is an
# XCUIElementTypeButton whose area is accepted by self.isNormalButtonSize
hasNormalButton = CommonUtils.isContainSpecificSoup(nextSiblingeSoupList, "XCUIElementTypeButton", self.isNormalButtonSize)

相关函数:

def isNormalButtonSize(self, curSize):
    """Tell whether a soup size (width * height) is the size of a normal button.

    Args:
        curSize (int): element size, width * height
    Returns:
        bool: True when curSize is within [30*30, 100*100], both inclusive
    """
    smallestArea = 30*30
    largestArea = 100*100
    return smallestArea <= curSize <= largestArea

查找元素,限定条件是符合对应的几级的父元素的条件

背景:

很多时候,需要在iOS的app的页面源码(即xml)中,查找符合特定情况的元素

这些特定情况,往往是parent或者前几层级的parent中,元素符合一定条件,往往是type,以及宽度是屏幕宽度,高度是屏幕高度等等

所以提取出公共函数,用于bs的find查找元素

def bsChainFind(curLevelSoup, queryChainList):
    """Find a soup node through a chain of per-level queries.

    The first query of the chain is searched inside curLevelSoup; every node
    it matches is then searched with the rest of the chain, and the first
    node found for the last query is returned.

    Args:
        curLevelSoup (soup): BeautifulSoup soup/node to search in
        queryChainList (list): one dict per level, each with the keys
            "tag" (tag name) and "attrs" (attribute dict), ordered from
            the highest level to the target element
    Returns:
        soup: the found node, or None when the chain is empty or nothing matches
    Examples:
        input:
            [
                {
                    "tag": "XCUIElementTypeWindow",
                    "attrs": {"visible":"true", "enabled":"true", "width": "%s" % ScreenX, "height": "%s" % ScreenY}
                },
                {
                    "tag": "XCUIElementTypeButton",
                    "attrs": {"visible":"true", "enabled":"true", "width": "%s" % ScreenX, "height": "%s" % ScreenY}
                },
                {
                    "tag": "XCUIElementTypeStaticText",
                    "attrs": {"visible":"true", "enabled":"true", "value":"可能离开微信,打开第三方应用"}
                },
            ]
        output:
            soup node of
                <XCUIElementTypeStaticText type="XCUIElementTypeStaticText" value="可能离开微信,打开第三方应用" name="可能离开微信,打开第三方应用" label="可能离开微信,打开第三方应用" enabled="true" visible="true" x="71" y="331" width="272" height="18"/>
            in:
                <XCUIElementTypeWindow type="XCUIElementTypeWindow" enabled="true" visible="true" x="0" y="0" width="414" height="736">
                    <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="0" width="414" height="736">
                        <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="0" width="414" height="736">
                            <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="0" width="414" height="736">
                                <XCUIElementTypeButton type="XCUIElementTypeButton" enabled="true" visible="true" x="0" y="0" width="414" height="736">
                                    <XCUIElementTypeStaticText type="XCUIElementTypeStaticText" enabled="true" visible="false" x="47" y="288" width="0" height="0"/>
                                    <XCUIElementTypeStaticText type="XCUIElementTypeStaticText" value="可能离开微信,打开第三方应用" name="可能离开微信,打开第三方应用" label="可能离开微信,打开第三方应用" enabled="true" visible="true" x="71" y="331" width="272" height="18"/>
                                    <XCUIElementTypeStaticText type="XCUIElementTypeStaticText" value="取消" name="取消" label="取消" enabled="true" visible="true" x="109" y="409" width="36" height="22"/>
                                    <XCUIElementTypeStaticText type="XCUIElementTypeStaticText" value="继续" name="继续" label="继续" enabled="true" visible="true" x="269" y="409" width="36" height="22"/>
                                </XCUIElementTypeButton>
                            </XCUIElementTypeOther>
                        </XCUIElementTypeOther>
                    </XCUIElementTypeOther>
                </XCUIElementTypeWindow>
    """
    if not queryChainList:
        return None

    headQuery = queryChainList[0]
    wantedTag = headQuery["tag"]
    wantedAttrs = headQuery["attrs"]

    if len(queryChainList) == 1:
        # last level: a single find gives the final answer
        return curLevelSoup.find(wantedTag, attrs=wantedAttrs)

    # higher level: try every matched node with the rest of the chain
    candidateSoupList = curLevelSoup.find_all(wantedTag, attrs=wantedAttrs)
    if not candidateSoupList:
        return None

    remainingChainList = queryChainList[1:]
    for candidateSoup in candidateSoupList:
        nestedResult = CommonUtils.bsChainFind(candidateSoup, remainingChainList)
        if nestedResult:
            return nestedResult

    return None

举例:

(1)

"""
    微信-小程序 弹框 警告 尚未进行授权:
        <XCUIElementTypeButton type="XCUIElementTypeButton" enabled="true" visible="true" x="0" y="0" width="375" height="667">
            <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="32" y="240" width="311" height="187">
                <XCUIElementTypeImage type="XCUIElementTypeImage" enabled="true" visible="false" x="32" y="240" width="311" height="187"/>
                <XCUIElementTypeStaticText type="XCUIElementTypeStaticText" value="警告" name="警告" label="警告" enabled="true" visible="true" x="52" y="265" width="271" height="21"/>
                <XCUIElementTypeTextView type="XCUIElementTypeTextView" value="尚未进行授权,请点击确定跳转到授权页面进行授权。" enabled="true" visible="true" x="60" y="300" width="255" height="57"/>
                <XCUIElementTypeButton type="XCUIElementTypeButton" name="取消" label="取消" enabled="true" visible="true" x="32" y="376" width="156" height="51"/>
                <XCUIElementTypeButton type="XCUIElementTypeButton" name="确定" label="确定" enabled="true" visible="true" x="187" y="376" width="156" height="51"/>
            </XCUIElementTypeOther>
        </XCUIElementTypeButton>
"""
# Query chain, from the outermost level to the target element
warningChainList = [
    # level 1: button whose width/height equal self.X / self.totalY
    {
        "tag": "XCUIElementTypeButton",
        "attrs": {"visible":"true", "enabled":"true", "width": "%s" % self.X, "height": "%s" % self.totalY}
    },
    # level 2: any visible and enabled XCUIElementTypeOther inside it
    {
        "tag": "XCUIElementTypeOther",
        "attrs": {"visible":"true", "enabled":"true"}
    },
    # level 3 (target): the static text holding the popup title
    {
        "tag": "XCUIElementTypeStaticText",
        "attrs": {"visible":"true", "enabled":"true", "value":"警告"}
    },
]
warningSoup = CommonUtils.bsChainFind(soup, warningChainList)

相关:

找到元素后,再去点击:

if warningSoup:
    # The confirm button is searched inside the parent node of the found title text
    parentOtherSoup = warningSoup.parent
    confirmSoup = parentOtherSoup.find(
        "XCUIElementTypeButton",
        attrs={"visible":"true", "enabled":"true", "name": "确定"}
    )
    if confirmSoup:
        # Click the button and record that a popup was found and processed
        self.clickElementCenterPosition(confirmSoup)
        foundAndProcessedPopup = True

(2)

"""
    系统弹框 拍照或录像:
        <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="8" y="530" width="398" height="133">
            <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="8" y="530" width="398" height="133">
                <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="8" y="530" width="398" height="133">
                    <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="false" x="0" y="0" width="398" height="133">
                        <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="0" width="398" height="133">
                            <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="0" width="398" height="133">
                                <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="0" width="398" height="133">
                                    <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="0" width="398" height="133">
                                        <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="0" width="398" height="133">
                                            <XCUIElementTypeTable type="XCUIElementTypeTable" enabled="true" visible="true" x="0" y="0" width="398" height="133">
                                                <XCUIElementTypeCell type="XCUIElementTypeCell" enabled="true" visible="true" x="0" y="0" width="398" height="45">
                                                    <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="43" width="398" height="2"/>
                                                    <XCUIElementTypeImage type="XCUIElementTypeImage" enabled="true" visible="false" x="13" y="6" width="1" height="32"/>
                                                    <XCUIElementTypeStaticText type="XCUIElementTypeStaticText" value="拍照或录像" name="拍照或录像" label="拍照或录像" enabled="true" visible="true" x="15" y="11" width="87" height="22"/>
                                                    <XCUIElementTypeImage type="XCUIElementTypeImage" enabled="true" visible="false" x="351" y="6" width="32" height="32"/>
                                                </XCUIElementTypeCell>
                                                <XCUIElementTypeCell type="XCUIElementTypeCell" enabled="true" visible="true" x="0" y="44" width="398" height="45">
                                                    <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="88" width="398" height="1"/>
                                                    <XCUIElementTypeImage type="XCUIElementTypeImage" enabled="true" visible="false" x="13" y="50" width="1" height="33"/>
                                                    <XCUIElementTypeStaticText type="XCUIElementTypeStaticText" value="照片图库" name="照片图库" label="照片图库" enabled="true" visible="true" x="15" y="56" width="70" height="21"/>
                                                    <XCUIElementTypeImage type="XCUIElementTypeImage" enabled="true" visible="false" x="351" y="50" width="32" height="33"/>
                                                </XCUIElementTypeCell>
                                                <XCUIElementTypeCell type="XCUIElementTypeCell" enabled="true" visible="true" x="0" y="88" width="398" height="45">
                                                    <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="false" x="0" y="132" width="398" height="1"/>
                                                    <XCUIElementTypeImage type="XCUIElementTypeImage" enabled="true" visible="false" x="13" y="94" width="1" height="33"/>
                                                    <XCUIElementTypeStaticText type="XCUIElementTypeStaticText" value="浏览" name="浏览" label="浏览" enabled="true" visible="true" x="14" y="100" width="36" height="21"/>
                                                    <XCUIElementTypeImage type="XCUIElementTypeImage" name="UIDocumentPicker-more" enabled="true" visible="false" x="351" y="94" width="32" height="33"/>
                                                </XCUIElementTypeCell>
                                                <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="132" width="398" height="1"/>
                                            </XCUIElementTypeTable>
                                        </XCUIElementTypeOther>
                                    </XCUIElementTypeOther>
                                </XCUIElementTypeOther>
                            </XCUIElementTypeOther>
                        </XCUIElementTypeOther>
                    </XCUIElementTypeOther>
                </XCUIElementTypeOther>
            </XCUIElementTypeOther>
        </XCUIElementTypeOther>
        <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="8" y="671" width="398" height="57">
            <XCUIElementTypeButton type="XCUIElementTypeButton" name="取消" label="取消" enabled="true" visible="true" x="8" y="671" width="398" height="57"/>
        </XCUIElementTypeOther>
"""
# Query chain, from the outermost level to the target element
photoCameraChainList = [
    # level 1: any visible and enabled XCUIElementTypeOther
    {
        "tag": "XCUIElementTypeOther",
        "attrs": {"enabled":"true", "visible":"true"}
    },
    # level 2: the table located at x=0, y=0 inside it
    {
        "tag": "XCUIElementTypeTable",
        "attrs": {"enabled":"true", "visible":"true", "x":"0", "y":"0"}
    },
    # level 3 (target): the static text of the wanted menu item
    {
        "tag": "XCUIElementTypeStaticText",
        "attrs": {"enabled":"true", "visible":"true", "value":"拍照或录像"}
    },
]
photoCameraSoup = CommonUtils.bsChainFind(soup, photoCameraChainList)

(3)iOS 设置 无线局域网 列表页 找 当前已连接的WiFi,特征是带蓝色✅的:

"""
    设置 无线局域网 列表页:
        <XCUIElementTypeTable type="XCUIElementTypeTable" enabled="true" visible="true" x="0" y="0" width="414" height="736">
        。。。
            <XCUIElementTypeCell type="XCUIElementTypeCell" name=“xxx_guest_5G, 安全网络, 信号强度 3 格,共 3 格" label=“xxx_guest_5G, 安全网络, 信号强度 3 格,共 3 格" enabled="true" visible="true" x="0" y="144" width="414" height="43">
                <XCUIElementTypeStaticText type="XCUIElementTypeStaticText" value=“xxx_guest_5G" name=“xxx_guest_5G" label=“xxx_guest_5G" enabled="true" visible="true" x="40" y="155" width="278" height="21"/>
                <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="0" y="186" width="414" height="1"/>
                <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="true" x="8" y="151" width="28" height="29">
                    <XCUIElementTypeImage type="XCUIElementTypeImage" name="UIPreferencesBlueCheck" enabled="true" visible="false" x="8" y="151" width="28" height="29"/>
                </XCUIElementTypeOther>
                <XCUIElementTypeOther type="XCUIElementTypeOther" enabled="true" visible="false" x="15" y="187" width="245" height="1"/>
                <XCUIElementTypeImage type="XCUIElementTypeImage" name="Lock" enabled="true" visible="false" x="326" y="159" width="8" height="12"/>
                <XCUIElementTypeImage type="XCUIElementTypeImage" name="WifiBars3" enabled="true" visible="false" x="346" y="153" width="16" height="25"/>
                <XCUIElementTypeButton type="XCUIElementTypeButton" name="更多信息" label="更多信息" enabled="true" visible="true" x="372" y="154" width="22" height="22"/>
            </XCUIElementTypeCell>
"""
# Parse the current page source (xml string) into a soup
curPageXml = self.get_page_source()
soup = CommonUtils.xmlToSoup(curPageXml)
# Query chain, from the outermost level to the target element
blueCheckChainList = [
    # level 1: cell starting at x=0 whose width equals self.X
    {
        "tag": "XCUIElementTypeCell",
        "attrs": {"enabled":"true", "visible":"true", "x":"0", "width":"%s" % self.X}
    },
    # level 2: any visible and enabled XCUIElementTypeOther inside the cell
    {
        "tag": "XCUIElementTypeOther",
        "attrs": {"enabled":"true", "visible":"true"}
    },
    # level 3 (target): the blue check image; "visible" is not part of the query
    # because the image is reported as visible="false" in the sample page source
    {
        "tag": "XCUIElementTypeImage",
        # "attrs": {"enabled":"true", "visible":"true", "name": "UIPreferencesBlueCheck"}
        "attrs": {"enabled":"true", "name": "UIPreferencesBlueCheck"}
    },
]
blueCheckSoup = CommonUtils.bsChainFind(soup, blueCheckChainList)
if blueCheckSoup:

results matching ""

    No results matching ""