根据标记自定义编码和解码字符串

时间:2018-04-04 09:23:07

标签: java encoding

我正在研究直方图的字符串表示的自定义编码和解码。

String包含有关开始时直方图的信息,然后进入bin的标记和数字表示。令牌表示下一个“运行”箱中箱高度和长度的长度的细节。代币是A,B,C,D ......等。

下面的代码适用于许多直方图,但我发现了一些破坏代码的异常。我不知道为什么,如果编码或解码错误,我无法解决。所以我会发布两者。

下面有很多代码,如果您发现任何明显的错误或可能的错误,请告诉我。

我遇到的一个问题是,我无法分辨这是导致问题的编码或解码。关于如何区分这一点的任何想法都会有所帮助。

我已经建立了很多单元测试,除此之外,所有单元测试都通过了。

首先将字符串编码为字节表示的代码它基本上通过获取令牌值,获取字符串的下N个数字并将其转换为字节来实现:

/** takes in a rawSPEC and returns a byte array to be encoded
 * @param rawSPEC raw SPEC
 * @return byte array
 */
public static byte[] encodeRawSPEC(String rawSPEC){

    //work out the size in advance.
    List<Byte> byteList = new LinkedList<>();

    //loop through the raw string
    int length = 0;
    int height = 0;
    for(int i = 0; i < rawSPEC.length(); i++){
        //read the token from the string
        char token = rawSPEC.charAt(i);

        byteList.add((byte)(token & 0xFF));

        //find the length and height in bytes of the valid token
        boolean nonSpecial = false;
        for(Token t : Token.values()){
            if(token == t.name().charAt(0)){
                nonSpecial = true;
                height = t.getHeight();
                length = t.getLength();
            }
        }

        if(nonSpecial) {
            //read the height and length values
            String tokenLength;
            String tokenHeight;
            if (length != 0 && length != 1) {
                if (length == 8) {
                    //1 byte. just convert to byte and add
                    tokenLength = rawSPEC.substring(i + 1, i + 4);
                    i += 3;
                    byteList.add((byte)(Integer.parseInt(tokenLength)));
                } else if (length == 16) {
                    //2 bytes. convert to byte array and add each sequencially
                    tokenLength = rawSPEC.substring(i + 1, i + 6);
                    i += 5;
                    byte[] byt = SPECTools.encode2BitStringDecimal(Integer.parseInt(tokenLength));
                    for(byte b : byt) byteList.add(b);
                } else {
                    //4 bytes. convert to byte array and add each sequencially
                    tokenLength = rawSPEC.substring(i + 1, i + 11);
                    i += 10;
                    byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(tokenLength));
                    for(byte b : byt) byteList.add(b);
                }

            }

                //encode any heights
            if (height != 0 && height != 1) {
                if (height == 8) {
                    tokenHeight = rawSPEC.substring(i + 1, i + 4);
                    i += 3;
                    byteList.add((byte)(Integer.parseInt(tokenHeight)));
                } else if (height == 16) {
                    tokenHeight = rawSPEC.substring(i + 1, i + 6);
                    i += 5;
                    byte[] byt = SPECTools.encode2BitStringDecimal(Integer.parseInt(tokenHeight));
                    for(byte b : byt) byteList.add(b);
                } else {
                    tokenHeight = rawSPEC.substring(i + 1, i + 11);
                    i += 10;
                    byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(tokenHeight));
                    for(byte b : byt) byteList.add(b);
                }

            }

        }else if(token == 'R'){
            String numReads = rawSPEC.substring(i+1, i+4);
            byteList.add((byte)(Integer.parseInt(numReads)));
            i+=3;
            int reads = Integer.parseInt(numReads);
            for(int j = 0; j < reads; j++){
                String h = rawSPEC.substring(i+1, i+4);
                i+=3;
                byteList.add((byte)(Integer.parseInt(h)));
            }
        }else if(token == 'S'){
            String numReads = rawSPEC.substring(i+1, i+4);
            int a = Integer.parseInt(numReads);
            byteList.add((byte)(Integer.parseInt(numReads)));

            i+=3;
            int reads = Integer.parseInt(numReads);
            for(int j = 0; j < reads; j++){
                String h = rawSPEC.substring(i+1, i+6);
                i+=5;
                byte[] byt = SPECTools.encode2BitStringDecimal(Integer.parseInt(h));
                for(byte b : byt) byteList.add(b);
            }
        }else if(token == 'T') {
            String h = rawSPEC.substring(i + 1, i + 11);
            i += 10;
            byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(h));
            for (byte b : byt) byteList.add(b);
            h = rawSPEC.substring(i + 1, i + 11);
            i += 10;
            byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(h));
            for (byte b : byt) byteList.add(b);
        }else if(token == 'U'){
            String h = rawSPEC.substring(i + 1, i + 11);
            i += 10;
            byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(h));
            for (byte b : byt) byteList.add(b);
        }else if(token == 'V'){
            for(int j = i; j < rawSPEC.length(); j++){
                if(rawSPEC.charAt(j) == '0' && rawSPEC.charAt(j+1) == '0' && rawSPEC.charAt(j+1) == '0'){
                    String h = rawSPEC.substring(i+1, j);
                    for(char c : h.toCharArray()){
                        byteList.add((byte)(c & 0xFF));
                    }
                    i += h.length();

                    byteList.add((byte)0);
                    i+=3;


                    break;
                }
            }
        }else if(token == 'W'){
            //encode the 6 escape characters
            for(int j = 0; j < 6; j ++){
                String numReads = rawSPEC.substring(i+1, i+4);
                byteList.add((byte)(Integer.parseInt(numReads)));
                i+=3;
            }

            //encode the 4 Bit number
            String number = rawSPEC.substring(i + 1, i + 11);
            i += 10;
            byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(number));
            for(byte b : byt) byteList.add(b);

        }else if(token == 'X'){
            //code
            String code = rawSPEC.substring(i+1, i+4);
            byteList.add((byte)(Integer.parseInt(code)));
            i+=3;

            //statement
            String statementLength = rawSPEC.substring(i+1, i+4);
            int leng = Integer.parseInt(statementLength);
            i+=3;
            byteList.add((byte)(leng));
            for(int j = i+1; j < i+1 + leng; j++){
                byteList.add((byte)rawSPEC.charAt(j));
            }
            i+=leng;

            //extension
            String extension = rawSPEC.substring(i+1, i+6);
            byte[] b = SPECTools.encode2BitStringDecimal(Integer.parseInt(extension));
            for(byte by: b ){
                byteList.add(by);
            }
            i+=5;
            //endseq
            int endLength = (Integer.parseInt(rawSPEC.substring(i+1, i+4)));
            byteList.add((byte) endLength);
            i+=3;
            if(endLength != 0){
                for(int j = i+1; j < i+1 + endLength; j++){
                    byteList.add((byte)rawSPEC.charAt(j));
                }
                i+=endLength;
            }

            //endseq
            int flankLength = (Integer.parseInt(rawSPEC.substring(i+1, i+4)));
            byteList.add((byte) flankLength);
            i+=3;
            if(flankLength != 0){
                for(int j = i+1; j < i+1 + flankLength; j++){
                    byteList.add((byte)rawSPEC.charAt(j));
                }
                i+=flankLength;
            }


        }else if(token == 'Y'){
            //must be Y
            String Ybit = rawSPEC.substring(i + 1, i + 11);
            i += 10;
            byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(Ybit));
            for(byte b : byt) byteList.add(b);
        }
    }

    //convert to array
    byte[] byteArray = new byte[byteList.size()];
    for(int i = 0; i < byteList.size(); i++) byteArray[i] = byteList.get(i);

    return byteArray;

}

其次,解码代码。以另一种方式工作,抓住令牌。获取适当的字节数,然后将字节转换为整数并附加到string:

public static char[] decodeBinarySPECtoRAW(byte[] bytes){
    StringBuilder sb = new StringBuilder();

        int height = 0;
        int length = 0;
        int histogramLength = 0;
            for (int i = 0; i < bytes.length; i++) {
                char token = (char) bytes[i];

                sb.append(token);
                boolean nonSpecial = false;
                for (Token t : Token.values()) {
                    if (token == t.name().charAt(0)) {
                        nonSpecial = true;
                        height = t.getHeight();
                        length = t.getLength();
                    }
                }

                if (nonSpecial) {
                    //length
                    if (length != 0 && length != 1) {
                        if (length == 8) {
                            //1 byte
                            sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
                            histogramLength+=bytes[i+1] & 0xFF;
                            i++;
                        } else if (length == 16) {
                            //2 bytes
                            histogramLength+= SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2]);
                            sb.append(String.format("%05d", SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2])));
                            i += 2;
                        } else {
                            //4 bytes
                            histogramLength += SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4]);
                            sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
                            i += 4;
                        }
                    }else{
                        histogramLength++;
                    }

                    //height
                    if (height != 0 && height != 1) {
                        if (height == 8) {
                            //1 byte
                            sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
                            i++;
                        } else if (height == 16) {
                            //2 bytes
                            sb.append(String.format("%05d", SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2])));
                            i += 2;
                        } else {
                            //4 bytes
                            sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
                            i += 4;
                        }
                    }
                } else {
                    switch (token) {
                        case 'R': {
                            int numReads = (int)bytes[i + 1];
                            i++;
                            sb.append(String.format("%03d", numReads & 0xFF));
                            for (int j = 0; j < numReads; j++) {
                                int nextNum = bytes[i + 1];
                                sb.append(String.format("%03d", nextNum & 0xFF));
                                histogramLength++;
                                i++;
                            }
                            break;
                        }
                        case 'S': {
                            int numReads = (int)bytes[i + 1];
                            i++;
                            sb.append(String.format("%03d", numReads & 0xFF));
                            for (int j = 0; j < numReads; j++) {
                                histogramLength++;

                                int in = SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2]);
                                sb.append(String.format("%05d", in));
                                i += 2;

                            }
                            break;
                        }
                        case 'T': {
                            sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
                            i += 4;
                            sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
                            i += 4;
                            break;
                        }
                        case 'U': {
                            sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
                            i += 4;
                            break;
                        }
                        case 'V': {
                            List<Byte> VBytes = new ArrayList<>();
                            boolean escapeFound = false;
                            while (!escapeFound) {
                                if( i + 1< bytes.length) {
                                    if ( bytes[i + 1] == 0) escapeFound = true;
                                    else {
                                        VBytes.add(bytes[i + 1]);
                                        i += 1;
                                    }
                                }
                            }
                            for (byte b : VBytes) {
                                sb.append((char) b);
                            }
                            sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
                            i += 1;
                            break;
                        }
                        case 'W': {
                            for (int j = 0; j < 6; j++) {
                                sb.append(String.format("%03d", bytes[i + 1]));
                                i += 1;
                            }
                            sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
                            i += 4;
                            break;
                        }
                        case 'X': {
                            sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
                            i += 1;
                            //get length of the statement
                            int statementLength = bytes[i + 1] & 0xFF;
                            sb.append(String.format("%03d", statementLength));
                            i += 1;
                            for (int j = i + 1; j < i + 1 + statementLength; j++) {
                                sb.append((char) bytes[j]);
                            }
                            i += statementLength;

                            sb.append(String.format("%05d", SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2])));
                            i += 2;
                            //endseq
                            int endLength = bytes[i + 1];
                            sb.append(String.format("%03d", endLength));
                            i += 1;
                            if (endLength != 0) {
                                for (int j = i + 1; j < i + 1 + endLength; j++) {
                                    sb.append((char) bytes[j]);
                                }
                                i += endLength;
                            }

                            //flankseq
                            int flankLength = bytes[i + 1];
                            sb.append(String.format("%03d", flankLength));
                            i += 1;
                            if (flankLength != 0) {
                                for (int j = i + 1; j < i + 1 + flankLength; j++) {
                                    sb.append((char) bytes[j]);
                                }
                                i += flankLength;
                            }
                            break;
                        }
                        case 'Y': {
                            //must be Y
                            sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
                            i += 4;
                            break;
                        }
                    }
                }

            }

    return sb.toString().toCharArray();
}

最后调用它的测试,这包括破坏代码的字符串。 AFAIK,直方图生成的字符串输入有效。

@Test
public void non_workingsample() throws Exception{

    String sample = "T00000000010000000001VTEL_Telomeric_repeat_Mus_musculus000X000008pu1.AE.100001000000W0000000000000000000000000101Y0000001236M00705I106I022I010I007I018I002I014I012EI004I020I006I011I004F002R028004007004008036012001017010007087005007010005008053001008003010005032000009003005007F002R019006002010004001000010006002004001000014004007003000001007J002004I007B002R010003001009007001003006002004006B002R010008000006005002000005006009010F002R007009004006005002000008J002005R053004002001005000006008044010006002004002039006009002012009046011012004009002046008013002010005067002012002007001042004010003011003040004006002011007025001014001J002006R017016004005003012006011000006002006005004001006001008J002003R009001014002006005000002007002J002005R038003002004002008011003001003001012004003000002003013004001000006000008006003001005002005010003002007000009003004002J002004R014005006001002008003006005130003006004006009B002R007001005009005001000006J002002I005B002R016018002007006002000009000005003000002003002004010F002R022004002004003001000007000007008001003006003004006002000005004011002F002R010005004008006000002004000004008J002002R004008002005003B002I006I003J002007R006000001005002011004B002R028004003014003001000009003007010003000006003010008000001006003012009000001008002012006F002I008AI007F003R022004001006011002000011004007005001000007003008004002000010001010006F002J002006I012I009B002R004006003015003J002002R016012004012002001002003002007002001000009005012010B002R014005000011006002000005002024008000006009002J002008R017004002009001009012003000005003004002003000010003005J002004S096000010000400003000060000900011000010000700002000070001700009000020001000002000050000800011000100000700006000370266000176000650021600022001050059100250000950002900091002720040900384002580024600104001490062400445001350006500185001730037500322002530004600149001710036700317001500004600104001790047300405001510005900090001640032100329001280003000073001580032100390001000003200076001070028600327013140004900103001380035300314002340004500208001830051000486002340000900003000180000100004J002002AI002EB002EB003F002B003I002B012S042000030000100002000070002401180006020016000658006480119001936003840008700339003650050200471002680004400234001750027301298002280010200226002780043100748010770024400997009950194303337000180001000003001410001100017B004I003I008M01730AI004I002I006F002AI004J002007R010013001003106006025022011001017B005S02500146000130016700336008750098100406000930000200454010770103600995002660000401095021900254000021000020000300047000080001500004J002002R010004005007003000003002007009005J002003R011006041025001000005024021013005004F002S19300011000030006700089001710046800302001190004600173002720041300314001220008400215003980056600636001460006600193003020044900399001240012300192002910051600427001450008700327003630064900523002040010900248003910068700605003500536300550007280137502166001800010300208009770054501811001290009400182002680051300382027310010200193002550050700388003330066000504006090123801354000060000200004000720000701363000060000000002000050000600998000020000000004000030001303781000070000000009000020001800015000040000800006000110002400009000050000100000000030000600807000630016100239004590051900283000740021600337005320080600300000880022200330006500076200363000870023600333006300077500349000930091600323005790082400391001160034500404008010227700833002490064200872019740071900287000850022400368005510074800302000780020600334006240071500313000640023500327007841768101020002910086701185023660031500012000030000500132000050100500002000030000100002000090000200005000000000100005000040629100006J002005I016I013M00268I008I005I008I010I021I007B002W0000000000000000000000472967S13900008000370001400498005110137801226005040015600006005010108901306004630015600002004610104801417004480011700006005810111601281004800013200003005230121601281004870011200001021010110601316004670016900003006400111401467129590012900005004960098904763004860014700005005540114201387004640012400007004880115801238004850015200006004560111701718005260014200004004810133403533004840015100003004230144104702004970014000006005260142401264005850011400004005190110601801004650013400005004560116401302004630012700010005880116501253004490011300008005850104110574004750010300007002750062405034014690007600006002270055200667001820007700004002450046500565002170006000001002240050404887002150008100002000310004500023J002010R014007008005006000007004003005009001004000001J002002R008000006002004003004000006B005I010B003EAI009EI002AF002R007005001000001002000005F002B003I006EB004I007EJ002002F002I005EB007EB002I007B005EB027I047B009M26433B004EB005Z0000081801B003M04952B002M02159000000000000000000";

    byte[] b = APUChunkEncoder.encodeRawSPEC(sample);

    //debugging
    for(byte d : b){
        System.out.println((char) d + " " + (int)d);
    }

    char[] result = APUChunkDecoder.decodeBinarySPECtoRAW(b);


    assertEquals(sample, String.valueOf(result));
}

测试结果:

T00000000010000000001VTEL_Telomeric_repeat_Mus_musculus000X000008pu1.AE.100001000000W0000000000000000000000000101Y0000001236M00705I106I022I010I007I018I002I014I012EI004I020I006I011I004F002R028004007004008036012001017010007087005007010005008053001008003010005032000009003005007F002R019006002010004001000010006002004001000014004007003000001007J002004I007B002R010003001009007001003006002004006B002R010008000006005002000005006009010F002R007009004006005002000008J002005R053004002001005000006008044010006002004002039006009002012009046011012004009002046008013002010005067002012002007001042004010003011003040004006002011007025001014001J002006R017016004005003012006011000006002006005004001006001008J002003R009001014002006005000002007002J002005R038003002004002008011003001003001012004003000002003013004001000006000008006003001005002005010003002007000009003004002J002004R014005006001002008003006005130003006004006009B002R007001005009005001000006J002002I005B002R016018002007006002000009000005003000002003002004010F002R022004002004003001000007000007008001003006003004006002000005004011002F002R010005004008006000002004000004008J002002R004008002005003B002I006I003J002007R006000001005002011004B002R028004003014003001000009003007010003000006003010008000001006003012009000001008002012006F002I008AI007F003R022004001006011002000011004007005001000007003008004002000010001010006F002J002006I012I009B002R004006003015003J002002R016012004012002001002003002007002001000009005012010B002R014005000011006002000005002024008000006009002J002008R017004002009001009012003000005003004002003000010003005J002004S096000010000400003000060000900011000010000700002000070001700009000020001000002000050000800011000100000700006000370266000176000650021600022001050059100250000950002900091002720040900384002580024600104001490062400445001350006500185001730037500322002530004600149001710036700317001500004600104001790047300405001510005900090001640032100329001280003000073001580032100390001000003200076001070028600327013140004900103001380035300314002340004500208001830051000486002340000900003000180000100004J002002AI002EB002EB003F002B003I002B012S042000030000100002000070002401180006020016000658006480119001936003840008700339003650050200471002680004400234001750027301298002280010200226002780043100748010770024400997009950194303337000180001000003001410001100017B004I003I008M01730AI004I002I006F002AI004J002007R010013001003106006025022011001017B005S02500146000130016700336008750098100406000930000200454010770103600995002660000401095021900254000021000020000300047000080001500004J002002R010004005007003000003002007009005J002003R011006041025001000005024021013005004F002S193  C22784 ᆱ ᅯ.w . ᆳ ン:z T00140903820020316796メ B000￁ .￁マ| { ￀ #ᆱム W000071001107001-1190034276044 m ￸ ヌᆵ]^￳&￘_v|bᄡ g ￐ ￑!チ ^ ᄊ ~ᆱf ￁  トM37889￸aᅱJ005006   H0000458835      ₩     
 ᅤ           |b            '? ᄀ ￯ ᅨJ000216 Q051210036045612X000222 JハkW ↓ Mv]] ヤCC8ヌt Yヤ!¥|bA ツh쐬U ¢ p'↓.N ᅫ Npᅨ9@ → GE#cᄀ> ;   ト  ■                モ JII
MI|bII|bI
IIBW       7ヌSヒ|b %  ￲bᅧ￸ワ  ￵Aᅬワ  ᅪノ￀u  00581092¢ト  ￀￧p  5|bR$ᅮᄅ  タZᄏ゚2チ  ￰￝ロ₩モ  *vk￐|  │ニᅱ¥リ  ￈]ᄊホ 000£6ᅪ
¦ラ  ᄃᄀ^￱フ  ミ￰I002r  R004009007209001ニ  ￈フᅬ 
 L0042796261004￁q |b I002N04100475g  pᆰᄑL0000393443000(ロᄊ M01024 ￵ ￑5￙ <  ¢ ￸ᅲ Q005120002031661  J002010R014007008005006000007004003005009001004000001J002002R008000006002004003004000006B005I010B003EAI009EI002AF002R007005001000001002000005F002B003I006EB004I007EJ002002F002I005EB007EB002I007B005EB027I047B009M26433B004EB005Z0000081801B003M04952B002M02159000000000000000000

显然,有些字符没有被正确编码或解码。但我无法弄清楚哪个......

非常欢迎任何想法或建议。感谢。

编辑:缺少部分代码:

令牌:

/** All lengths and heights in bits.
  * All 1's are to be ignored in writing
  * i.e 1 - 0 is transcoded as A.
  * 1 -1 is transcoded as E
 * 1 - 209 is transcoded as I209
 * 1 - 2 is transcoded as I002 **/
public enum Token {

A (1, 0),
B (8, 0),
C (16,0),
D (32,0),
E (1, 1),
F (8, 1),
G (16,1),
H (32,1),
I (1 ,8),
J (8, 8),
K (16,8),
L (32,8),
M (1,16),
N (8,16),
O (16,16),
P (32,16),
Q (16,32),
Z (1,32);


private final int length;
private final int height;


Token(int length, int height) {
    this.length = length;
    this.height = height;
}

public int getLength() {
    return length;
}

public int getHeight() {
    return height;
}

 }

Spectools,只是一个辅助类:

public class SPECTools {

public static byte[] encode2BitStringDecimal(int i) {
    byte[] result = new byte[2];
    result[0] = (byte) (i & 0xFF);
    result[1] = (byte) ((i >> 8) & 0xFF);
    return result;
}

public static byte[] encode4BitStringDecimal(int i) {
    byte[] result = new byte[4];
    result[0] = (byte) (i >> 24 & 0xFF);
    result[1] = (byte) (i >> 16 & 0xFF);
    result[2] = (byte) (i >> 8 & 0xFF);
    result[3] = (byte) (i & 0xFF) ;
    return result;
}

public static int convertFromByteArray2(byte byte1, byte byte2){
    return (byte2 & 0xFF) << 8 | (byte1 & 0xFF);
}

public static int convertFromByteArray4(byte byte1, byte byte2, byte byte3, byte byte4){
    return byte1 << 24 | (byte2 & 0xFF) << 16 | (byte3 & 0xFF) << 8 | (byte4 & 0xFF);
}

}

0 个答案:

没有答案