如何通过考虑前一个和下一个单词来精确匹配字符串

时间:2017-02-18 08:06:46

标签: java regex xml string

我是java regex的新手。所以,我有一个包含不同节点的xml文件。文件是 -

<Node id="855"/>PROFILE<Node id="862"/>:<Node id="863"/>
<Node id="864"/>8<Node id="865"/> <Node id="866"/>years<Node id="871"/> <Node id="872"/>IT<Node id="874"/> <Node id="875"/>industry<Node id="883"/> <Node id="884"/>experience<Node id="894"/> <Node id="895"/>in<Node id="897"/> <Node id="898"/>web<Node id="901"/> <Node id="902"/>based<Node id="907"/> <Node id="908"/>applications<Node id="920"/> <Node id="921"/>that<Node id="925"/> <Node id="926"/>involved<Node id="934"/> <Node id="935"/>extensive<Node id="944"/> <Node id="945"/>development<Node id="956"/> <Node id="957"/>work<Node id="961"/> <Node id="962"/>in<Node id="964"/> <Node id="965"/>Java<Node id="969"/>/<Node id="970"/>J<Node id="971"/>2<Node id="972"/>EE<Node id="974"/>,<Node id="975"/>Jquery<Node id="981"/>,<Node id="982"/>Jqgrid<Node id="988"/>,<Node id="989"/>Ajax<Node id="993"/>.<Node id="994"/>
<Node id="995"/>Good<Node id="999"/> <Node id="1000"/>experience<Node id="1010"/> <Node id="1011"/>in<Node id="1013"/> <Node id="1014"/>agile<Node id="1019"/> <Node id="1020"/>methodology<Node id="1031"/> <Node id="1032"/>.<Node id="1033"/>

我有一个字符串,我需要与此字符串匹配。

PROFILE:
8 years IT industry experience in web based applications that involved extensive development work in Java/J2EE,Jquery,Jqgrid,Ajax.

所以,

/**
 * Looks for the tokens in {@code elements} inside the node markup loaded from
 * {@code filePath}, printing progress to standard output.
 *
 * NOTE: this snippet is truncated in the original post; the enclosing
 * {@code for} loop and the method body are never closed.
 *
 * @param elements tokens of the sentence to look for, in order
 * @param filePath location handed to getTextWithNodesDataFromXml to obtain the markup
 */
private void parseXml(ArrayList<String> elements, String filePath) {
    // Becomes true after the first token is processed, so only the first token is ever searched for.
    boolean flag = false;
    // Not used in the visible part of the snippet.
    String nextId = "0";
    String xmlData = getTextWithNodesDataFromXml(filePath);
    for (String s : elements) {
        System.out.println(s);
        // token and id are declared but not used in the visible part of the snippet.
        String token;
        int id;
        String regex = "";
        if (flag == false) {
            // Matches a node tag (group 1: numeric id) immediately followed by the token (group 2).
            // NOTE(review): s is concatenated into the pattern unescaped, so regex
            // metacharacters inside a token would be interpreted as pattern syntax.
            regex = "<Node id=\"([0-9]+)\"\\/>(" + s + ")";
            flag = true;
            Pattern pattern1 = Pattern.compile(regex);
            Matcher matcher1 = pattern1.matcher(xmlData);
            // find() stops at the first occurrence only; further occurrences are not examined.
            if (matcher1.find()) {
                System.out.println("match found -->" + s);
            }
        }

所以,第一个参数是一个数组列表,其中包含要匹配的字符串的各个标记(token),第二个参数是文件的路径。xmlData 就是我前面提到的那些节点,我需要在其中进行匹配。那么,如果 PROFILE 被匹配到三次,我该如何检查整个字符串?我必须在这些节点中精确匹配完整的字符串,该怎么做?

2 个答案:

答案 0 :(得分:0)

我建议比较可比较的东西:你展示的XML是句子的标记化。您尝试将其与整个String进行比较。

如果将XML转换为String数组,并用 String.split("\\s+") 对以 PROFILE 开头的句子进行分词,那么您需要比较的就是两个String数组。

也许,你不想要完全匹配?

在这种情况下,您必须计算相似度百分比 p,确定一个阈值 t,并仅保留 p > t 的 PROFILE 匹配结果。

答案 1 :(得分:0)

我对问题的要求做了一些假设,并按照我对它的最佳理解写了一个可运行的解决方案。

我取了个巧:没有真正读取文件,而是直接把XML作为字符串传入。

package stacktest;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Finds a tokenised sentence inside markup of the form
 * {@code <Node id="N"/>text<Node id="M"/>text...}.
 *
 * <p>The markup is cut into sections, each starting at a
 * {@code <Node .../>PROFILE<Node .../>:} header and running up to the next such
 * header (or to the end of the input). A section matches only when its node
 * texts are exactly the expected tokens: same count, same order, same content.
 */
public class StringMatch
{
    /** Start of a section: a node tag, the literal PROFILE, a node tag, a colon and the next '<'. */
    private static final Pattern PROFILE_HEADER =
            Pattern.compile("<Node id=\"[0-9]+\"\\/>PROFILE<Node id=\"[0-9]+\"\\/>:<");

    /** A node tag (group 1: numeric id) followed by its non-empty text up to the next '<' (group 2). */
    private static final Pattern NODE_TOKEN =
            Pattern.compile("<Node id=\"([0-9]+)\"\\/>([^<]+)");

    /**
     * Returns the markup to search.
     *
     * @param filePath despite its name, the markup itself
     * @return {@code filePath} unchanged
     */
    private String getTextWithNodesDataFromXml(String filePath)
    {
        // cheating here by passing the xml as a string in.
        return filePath;
    }

    /**
     * Searches the markup for a PROFILE section whose node texts equal {@code elements}.
     *
     * @param elements expected tokens, in order
     * @param filePath forwarded to {@link #getTextWithNodesDataFromXml(String)}
     * @return a two-element array holding the id of the node preceding the first
     *         token and the id of the node preceding the last token of the first
     *         matching section, or {@code null} when no section matches
     */
    private int[] findMatches(List<String> elements, String filePath)
    {
        String xmlData = getTextWithNodesDataFromXml(filePath);

        for (String profile : splitIntoProfiles(xmlData))
        {
            List<String> ids = new ArrayList<String>();
            List<String> tokens = new ArrayList<String>();

            Matcher node = NODE_TOKEN.matcher(profile);
            while (node.find())
            {
                ids.add(node.group(1));
                tokens.add(node.group(2));
            }

            // List.equals checks size and pairwise equality and stops at the first
            // difference. The emptiness guard keeps ids.get(0) safe.
            if (!tokens.isEmpty() && tokens.equals(elements))
            {
                return new int[] {
                    Integer.parseInt(ids.get(0)),
                    Integer.parseInt(ids.get(ids.size() - 1))
                };
            }
        }

        return null;
    }

    /**
     * Cuts the markup at every PROFILE header.
     *
     * @param xmlData markup to split
     * @return the sections in document order; text before the first header is
     *         dropped, and the list is empty when there is no header at all
     */
    private static List<String> splitIntoProfiles(String xmlData)
    {
        List<String> profiles = new ArrayList<String>();
        Matcher header = PROFILE_HEADER.matcher(xmlData);

        int sectionStart = -1; // -1 until the first header has been seen
        while (header.find())
        {
            if (sectionStart >= 0)
            {
                profiles.add(xmlData.substring(sectionStart, header.start()));
            }
            sectionStart = header.start();
        }
        // The last section has no following header and runs to the end of the input.
        if (sectionStart >= 0)
        {
            profiles.add(xmlData.substring(sectionStart));
        }
        return profiles;
    }

    /**
     * Runs the matcher against three sample inputs and prints each result.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        // Shared fixture pieces: the header line and the sentence line of the wanted profile.
        String profileHeader = "<Node id=\"855\"/>PROFILE<Node id=\"862\"/>:<Node id=\"863\"/>\n";
        String profileBody = "<Node id=\"864\"/>8<Node id=\"865\"/> <Node id=\"866\"/>years<Node id=\"871\"/> <Node id=\"872\"/>IT<Node id=\"874\"/> <Node id=\"875\"/>industry<Node id=\"883\"/> <Node id=\"884\"/>experience<Node id=\"894\"/> <Node id=\"895\"/>in<Node id=\"897\"/> <Node id=\"898\"/>web<Node id=\"901\"/> <Node id=\"902\"/>based<Node id=\"907\"/> <Node id=\"908\"/>applications<Node id=\"920\"/> <Node id=\"921\"/>that<Node id=\"925\"/> <Node id=\"926\"/>involved<Node id=\"934\"/> <Node id=\"935\"/>extensive<Node id=\"944\"/> <Node id=\"945\"/>development<Node id=\"956\"/> <Node id=\"957\"/>work<Node id=\"961\"/> <Node id=\"962\"/>in<Node id=\"964\"/> <Node id=\"965\"/>Java<Node id=\"969\"/>/<Node id=\"970\"/>J<Node id=\"971\"/>2<Node id=\"972\"/>EE<Node id=\"974\"/>,<Node id=\"975\"/>Jquery<Node id=\"981\"/>,<Node id=\"982\"/>Jqgrid<Node id=\"988\"/>,<Node id=\"989\"/>Ajax<Node id=\"993\"/>.<Node id=\"994\"/>";

        // Wanted profile followed by extra text in the same section: the section has
        // more tokens than expected, so no match is reported.
        String nodes = profileHeader + profileBody + "\n" +
                "<Node id=\"995\"/>Good<Node id=\"999\"/> <Node id=\"1000\"/>experience<Node id=\"1010\"/>";

        // Exactly the wanted profile and nothing else.
        String nodes2 = profileHeader + profileBody;

        // Wanted profile sandwiched between two other PROFILE sections.
        String nodes3 = "<Node id=\"1\"/>PROFILE<Node id=\"2\"/>:<Node id=\"3\"/>This<Node id=\"4\"/>is<Node id=\"5\"/>not<Node id=\"6\"/>the<Node id=\"7\"/>Profile<Node id=\"8\"/>\n" +
                profileHeader + profileBody +
                "PROFILE<Node id=\"1021\"/>:<Node id=\"1022\"/>This<Node id=\"1023\"/>is<Node id=\"1024\"/>not<Node id=\"1025\"/>the<Node id=\"1026\"/>Profile<Node id=\"1027\"/>\n";

        List<String> elements = Arrays.asList(
                "PROFILE", ":", "\n",
                "8", " ", "years", " ", "IT", " ", "industry", " ", "experience", " ", "in", " ", "web",
                " ", "based", " ", "applications", " ", "that", " ", "involved", " ", "extensive", " ",
                "development", " ", "work", " ", "in", " ", "Java", "/", "J", "2", "EE", ",", "Jquery",
                ",", "Jqgrid", ",", "Ajax", ".");

        StringMatch sm = new StringMatch();
        printTest(sm.findMatches(elements, nodes));
        printTest(sm.findMatches(elements, nodes2));
        printTest(sm.findMatches(elements, nodes3));
    }

    /**
     * Prints one search result followed by a separator line.
     *
     * @param vals the id pair returned by {@code findMatches}, or {@code null} for no match
     */
    private static void printTest(int[] vals)
    {
        if (vals != null)
        {
            System.out.println("found match from id: " + vals[0] + " to " + vals[1]);
        }
        else
        {
            System.out.println("no match");
        }
        System.out.println("--------------------------------");
    }

}

main 方法中对该方法进行了三次测试调用,它们的输出依次为:

no match
--------------------------------
found match from id: 855 to 993
--------------------------------
found match from id: 855 to 993
--------------------------------