将Sentence Case应用于可能包含HTML的字符串

时间:2016-04-09 01:41:02

标签: javascript vbscript asp-classic

我有一系列字符串需要转换为句首大写(Sentence Case)格式,但麻烦的是这些字符串中可能含有 HTML 锚标签(<a>)。

可能包含这样的html:

<a href="/foo">foo</a> is a word. this is another word, and <a href="/bar">bar</a> is another.

我希望应用句首大写之后得到如下输出:

<a href="/foo">Foo</a> is a word. This is another word, and <a href="/bar">bar</a> is another.

使用 JavaScript 或 VBScript 的解决方案我都可以采用。

2 个答案:

答案 0 :(得分:2)

我认为你可以构建一个相当简单的方法:逐个字符遍历字符串,并根据遇到的内容设置状态标志(即设置一个 inHtml 标志表示当前位于 HTML 标签内部,再设置一个 shouldCapitalize 标志表示当前是否位于句子的开头):

/**
 * Applies sentence case to a string that may contain HTML markup.
 *
 * The first ASCII letter of the text, and the first ASCII letter after each
 * sentence terminator ('.', '?' or '!'), is upper-cased. Everything between
 * '<' and the next '>' is treated as a tag and copied verbatim, as are
 * complete character references such as "&amp;" or "&#39;" (entity names are
 * case-sensitive, so upper-casing their first letter would break them).
 *
 * A terminator that is immediately followed by a letter or digit (as in
 * "1.5" or "example.com") is not treated as the end of a sentence.
 *
 * Limitations: this is a character scanner, not an HTML parser. A literal
 * '<' in text or a '>' inside an attribute value will confuse the tag
 * tracking, and only the ASCII letters a-z / A-Z are recognised as letters.
 *
 * @param {string} s Text to convert; null or undefined yields ''.
 * @returns {string} The converted text.
 */
function titleCaseHtmlSentence(s){
    // A missing value has no characters to process
    if (s === null || s === undefined) {
        return '';
    }

    // A complete named, decimal or hexadecimal character reference,
    // anchored at the start of the probed text
    var entityPattern = /^&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);/;

    var result = '';
    // True while the next letter encountered starts a sentence
    var shouldCapitalize = true;
    // True while scanning between '<' and the matching '>'
    var inHtml = false;

    for (var i = 0; i < s.length; i++) {
        var ch = s[i];

        // Inside a tag: copy verbatim until the closing '>'
        if (inHtml) {
            result += ch;
            if (ch == '>') {
                inHtml = false;
            }
            continue;
        }

        // Start of a tag
        if (ch == '<') {
            result += ch;
            inHtml = true;
            continue;
        }

        // Character reference: copy it whole so its name is never re-cased.
        // The probe window is bounded so each '&' costs constant work.
        if (ch == '&') {
            var entity = entityPattern.exec(s.slice(i, i + 40));
            if (entity) {
                result += entity[0];
                i += entity[0].length - 1;
                continue;
            }
        }

        // Letters: capitalize only the first one of each sentence
        if (/[a-zA-Z]/.test(ch)) {
            if (shouldCapitalize) {
                result += ch.toUpperCase();
                shouldCapitalize = false;
            } else {
                result += ch;
            }
            continue;
        }

        result += ch;

        // Sentence terminator, unless it sits inside a token such as "1.5"
        if (/[\?\!\.]/.test(ch)) {
            var next = s[i + 1];
            if (next === undefined || !/[a-zA-Z0-9]/.test(next)) {
                shouldCapitalize = true;
            }
        }
    }
    return result;
}

这段代码是我匆忙写出来的,所以肯定算不上最优,但它应该可以正常工作,如这个示例所示。

答案 1 :(得分:1)

如果有人需要的话,这里是把 Rion Williams 的逻辑移植成 VBScript 函数的版本。我用到了自己类库中的一些函数,所以把所需的那部分也一并附上,以供参考。

正如Rion所说,这只是一个开始,需要进行大量的微调。

'Applies sentence case to a string that may contain HTML markup.
'
'The first ASCII letter of the text, and the first ASCII letter after each
'sentence terminator (".", "?" or "!"), is upper-cased. Everything between
'"<" and the next ">" is treated as a tag and written unchanged, as are
'complete character references such as "&amp;" or "&#39;" (entity names are
'case-sensitive, so upper-casing their first letter would break them).
'
'A terminator immediately followed by a letter or digit (as in "1.5" or
'"example.com") is not treated as the end of a sentence.
'
'x       - text to convert; Null yields "".
'Returns - the converted text.
'
'Relies on the adoStream (string builder) and regularExpression (regex
'wrapper) classes being available in the same script.
Function toSentenceCase(byVal x)
    Dim i, n, r, s, sChar, sNext, iSemi, sEntity, bCapitalize, bInHtml

    'a Null value has no characters to process
    If IsNull(x) Then
        toSentenceCase = ""
        Exit Function
    End If

    bCapitalize = True  'the next letter encountered starts a sentence
    bInHtml = False     'currently between "<" and ">"

    Set r = New regularExpression
    Set s = New adoStream

    n = Len(x)
    i = 1
    Do While i <= n
        sChar = Mid(x, i, 1)

        If bInHtml Then
            'inside a tag: write unchanged until the closing ">"
            s.Add sChar
            If sChar = ">" Then bInHtml = False

        ElseIf sChar = "<" Then
            'start of a tag
            s.Add sChar
            bInHtml = True

        ElseIf sChar = "&" Then
            'character reference: write it whole so its name is never re-cased.
            'only a nearby ";" is considered, which bounds the candidate length
            sEntity = ""
            iSemi = InStr(i, x, ";")
            If iSemi > 0 And iSemi - i < 40 Then sEntity = Mid(x, i, iSemi - i + 1)

            If r.test("^&(?:#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]*);$", sEntity) Then
                s.Add sEntity
                i = iSemi   'resume after the ";"
            Else
                s.Add sChar
            End If

        ElseIf r.test("[a-zA-Z]", sChar) Then
            'letters: capitalize only the first one of each sentence
            If bCapitalize Then
                s.Add UCase(sChar)
                bCapitalize = False
            Else
                s.Add sChar
            End If

        Else
            s.Add sChar

            'sentence terminator, unless it sits inside a token such as "1.5".
            'Mid returns "" past the end of the string, which never matches
            If sChar = "?" Or sChar = "!" Or sChar = "." Then
                sNext = Mid(x, i + 1, 1)
                If Not r.test("[a-zA-Z0-9]", sNext) Then bCapitalize = True
            End If
        End If

        i = i + 1
    Loop

    toSentenceCase = s.Value

    Set s = Nothing
    Set r = Nothing
End Function

Class adoStream
    'String builder backed by an ADODB.Stream text stream. Intended as a
    'faster alternative to repeatedly concatenating string variables.
    Private oStream

    'Creates the underlying stream and opens it in text mode.
    Private Sub Class_Initialize()
        Set oStream = CreateObject("ADODB.Stream")
        With oStream
            .Type = 2 '2 = text stream
            .Open
        End With
    End Sub

    'Closes the underlying stream and drops the reference to it.
    Private Sub Class_Terminate()
        oStream.Close
        Set oStream = Nothing
    End Sub

    'Appends addString to the stream. This is the default member of the class.
    Public Default Sub Add(byVal addString)
        Call oStream.WriteText(addString)
    End Sub

    'Discards the current content, then writes addString as the new content.
    Public Sub Update(byVal addString)
        oStream.Position = 0
        Call oStream.SetEOS()
        Call oStream.WriteText(addString)
    End Sub

    'Returns everything written so far, reading from the start of the stream.
    Public Property Get Value
        oStream.Position = 0
        Value = oStream.ReadText()
    End Property

    'Empties the stream by marking position 0 as the end of the stream.
    Public Function Clear()
        oStream.Position = 0
        Call oStream.SetEOS()
    End Function
End Class


Class regularExpression
    'Thin wrapper around the VBScript RegExp object, configured once for
    'global, case-insensitive matching.
    Private oRegExp

    'Creates the shared RegExp instance and sets its matching options.
    Private Sub Class_Initialize()
        Set oRegExp = New RegExp
        With oRegExp
            .Global = True    'e.g. findall
            .IgnoreCase = True
        End With
    End Sub

    'Drops the reference to the RegExp instance.
    Private Sub Class_Terminate()
        Set oRegExp = Nothing
    End Sub

    'Returns True when sPattern matches anywhere in sTestString, otherwise
    'False. A Null sTestString returns False without running the pattern.
    Public Function test(byVal sPattern, byVal sTestString)
        If IsNull(sTestString) Then
            test = False
        Else
            oRegExp.Pattern = sPattern
            test = oRegExp.Test(sTestString)
        End If
    End Function
End Class