我正在研究直方图的字符串表示的自定义编码和解码。
String包含有关开始时直方图的信息,然后进入bin的标记和数字表示。令牌表示下一个“运行”箱中箱高度和长度的长度的细节。代币是A,B,C,D ......等。
下面的代码适用于许多直方图,但我发现了一些破坏代码的异常。我不知道为什么,如果编码或解码错误,我无法解决。所以我会发布两者。
下面有很多代码,如果您发现任何明显的错误或可能的错误,请告诉我。
我遇到的一个问题是,我无法分辨这是导致问题的编码或解码。关于如何区分这一点的任何想法都会有所帮助。
我已经建立了很多单元测试,除此之外,所有单元测试都通过了。
首先将字符串编码为字节表示的代码它基本上通过获取令牌值,获取字符串的下N个数字并将其转换为字节来实现:
/** takes in a rawSPEC and returns a byte array to be encoded
* @param rawSPEC raw SPEC
* @return byte array
*/
public static byte[] encodeRawSPEC(String rawSPEC){
//work out the size in advance.
List<Byte> byteList = new LinkedList<>();
//loop through the raw string
int length = 0;
int height = 0;
for(int i = 0; i < rawSPEC.length(); i++){
//read the token from the string
char token = rawSPEC.charAt(i);
byteList.add((byte)(token & 0xFF));
//find the length and height in bytes of the valid token
boolean nonSpecial = false;
for(Token t : Token.values()){
if(token == t.name().charAt(0)){
nonSpecial = true;
height = t.getHeight();
length = t.getLength();
}
}
if(nonSpecial) {
//read the height and length values
String tokenLength;
String tokenHeight;
if (length != 0 && length != 1) {
if (length == 8) {
//1 byte. just convert to byte and add
tokenLength = rawSPEC.substring(i + 1, i + 4);
i += 3;
byteList.add((byte)(Integer.parseInt(tokenLength)));
} else if (length == 16) {
//2 bytes. convert to byte array and add each sequencially
tokenLength = rawSPEC.substring(i + 1, i + 6);
i += 5;
byte[] byt = SPECTools.encode2BitStringDecimal(Integer.parseInt(tokenLength));
for(byte b : byt) byteList.add(b);
} else {
//4 bytes. convert to byte array and add each sequencially
tokenLength = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(tokenLength));
for(byte b : byt) byteList.add(b);
}
}
//encode any heights
if (height != 0 && height != 1) {
if (height == 8) {
tokenHeight = rawSPEC.substring(i + 1, i + 4);
i += 3;
byteList.add((byte)(Integer.parseInt(tokenHeight)));
} else if (height == 16) {
tokenHeight = rawSPEC.substring(i + 1, i + 6);
i += 5;
byte[] byt = SPECTools.encode2BitStringDecimal(Integer.parseInt(tokenHeight));
for(byte b : byt) byteList.add(b);
} else {
tokenHeight = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(tokenHeight));
for(byte b : byt) byteList.add(b);
}
}
}else if(token == 'R'){
String numReads = rawSPEC.substring(i+1, i+4);
byteList.add((byte)(Integer.parseInt(numReads)));
i+=3;
int reads = Integer.parseInt(numReads);
for(int j = 0; j < reads; j++){
String h = rawSPEC.substring(i+1, i+4);
i+=3;
byteList.add((byte)(Integer.parseInt(h)));
}
}else if(token == 'S'){
String numReads = rawSPEC.substring(i+1, i+4);
int a = Integer.parseInt(numReads);
byteList.add((byte)(Integer.parseInt(numReads)));
i+=3;
int reads = Integer.parseInt(numReads);
for(int j = 0; j < reads; j++){
String h = rawSPEC.substring(i+1, i+6);
i+=5;
byte[] byt = SPECTools.encode2BitStringDecimal(Integer.parseInt(h));
for(byte b : byt) byteList.add(b);
}
}else if(token == 'T') {
String h = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(h));
for (byte b : byt) byteList.add(b);
h = rawSPEC.substring(i + 1, i + 11);
i += 10;
byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(h));
for (byte b : byt) byteList.add(b);
}else if(token == 'U'){
String h = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(h));
for (byte b : byt) byteList.add(b);
}else if(token == 'V'){
for(int j = i; j < rawSPEC.length(); j++){
if(rawSPEC.charAt(j) == '0' && rawSPEC.charAt(j+1) == '0' && rawSPEC.charAt(j+1) == '0'){
String h = rawSPEC.substring(i+1, j);
for(char c : h.toCharArray()){
byteList.add((byte)(c & 0xFF));
}
i += h.length();
byteList.add((byte)0);
i+=3;
break;
}
}
}else if(token == 'W'){
//encode the 6 escape characters
for(int j = 0; j < 6; j ++){
String numReads = rawSPEC.substring(i+1, i+4);
byteList.add((byte)(Integer.parseInt(numReads)));
i+=3;
}
//encode the 4 Bit number
String number = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(number));
for(byte b : byt) byteList.add(b);
}else if(token == 'X'){
//code
String code = rawSPEC.substring(i+1, i+4);
byteList.add((byte)(Integer.parseInt(code)));
i+=3;
//statement
String statementLength = rawSPEC.substring(i+1, i+4);
int leng = Integer.parseInt(statementLength);
i+=3;
byteList.add((byte)(leng));
for(int j = i+1; j < i+1 + leng; j++){
byteList.add((byte)rawSPEC.charAt(j));
}
i+=leng;
//extension
String extension = rawSPEC.substring(i+1, i+6);
byte[] b = SPECTools.encode2BitStringDecimal(Integer.parseInt(extension));
for(byte by: b ){
byteList.add(by);
}
i+=5;
//endseq
int endLength = (Integer.parseInt(rawSPEC.substring(i+1, i+4)));
byteList.add((byte) endLength);
i+=3;
if(endLength != 0){
for(int j = i+1; j < i+1 + endLength; j++){
byteList.add((byte)rawSPEC.charAt(j));
}
i+=endLength;
}
//endseq
int flankLength = (Integer.parseInt(rawSPEC.substring(i+1, i+4)));
byteList.add((byte) flankLength);
i+=3;
if(flankLength != 0){
for(int j = i+1; j < i+1 + flankLength; j++){
byteList.add((byte)rawSPEC.charAt(j));
}
i+=flankLength;
}
}else if(token == 'Y'){
//must be Y
String Ybit = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(Ybit));
for(byte b : byt) byteList.add(b);
}
}
//convert to array
byte[] byteArray = new byte[byteList.size()];
for(int i = 0; i < byteList.size(); i++) byteArray[i] = byteList.get(i);
return byteArray;
}
其次,解码代码。以另一种方式工作,抓住令牌。获取适当的字节数,然后将字节转换为整数并附加到string:
public static char[] decodeBinarySPECtoRAW(byte[] bytes){
StringBuilder sb = new StringBuilder();
int height = 0;
int length = 0;
int histogramLength = 0;
for (int i = 0; i < bytes.length; i++) {
char token = (char) bytes[i];
sb.append(token);
boolean nonSpecial = false;
for (Token t : Token.values()) {
if (token == t.name().charAt(0)) {
nonSpecial = true;
height = t.getHeight();
length = t.getLength();
}
}
if (nonSpecial) {
//length
if (length != 0 && length != 1) {
if (length == 8) {
//1 byte
sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
histogramLength+=bytes[i+1] & 0xFF;
i++;
} else if (length == 16) {
//2 bytes
histogramLength+= SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2]);
sb.append(String.format("%05d", SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2])));
i += 2;
} else {
//4 bytes
histogramLength += SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4]);
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
}
}else{
histogramLength++;
}
//height
if (height != 0 && height != 1) {
if (height == 8) {
//1 byte
sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
i++;
} else if (height == 16) {
//2 bytes
sb.append(String.format("%05d", SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2])));
i += 2;
} else {
//4 bytes
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
}
}
} else {
switch (token) {
case 'R': {
int numReads = (int)bytes[i + 1];
i++;
sb.append(String.format("%03d", numReads & 0xFF));
for (int j = 0; j < numReads; j++) {
int nextNum = bytes[i + 1];
sb.append(String.format("%03d", nextNum & 0xFF));
histogramLength++;
i++;
}
break;
}
case 'S': {
int numReads = (int)bytes[i + 1];
i++;
sb.append(String.format("%03d", numReads & 0xFF));
for (int j = 0; j < numReads; j++) {
histogramLength++;
int in = SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2]);
sb.append(String.format("%05d", in));
i += 2;
}
break;
}
case 'T': {
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
break;
}
case 'U': {
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
break;
}
case 'V': {
List<Byte> VBytes = new ArrayList<>();
boolean escapeFound = false;
while (!escapeFound) {
if( i + 1< bytes.length) {
if ( bytes[i + 1] == 0) escapeFound = true;
else {
VBytes.add(bytes[i + 1]);
i += 1;
}
}
}
for (byte b : VBytes) {
sb.append((char) b);
}
sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
i += 1;
break;
}
case 'W': {
for (int j = 0; j < 6; j++) {
sb.append(String.format("%03d", bytes[i + 1]));
i += 1;
}
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
break;
}
case 'X': {
sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
i += 1;
//get length of the statement
int statementLength = bytes[i + 1] & 0xFF;
sb.append(String.format("%03d", statementLength));
i += 1;
for (int j = i + 1; j < i + 1 + statementLength; j++) {
sb.append((char) bytes[j]);
}
i += statementLength;
sb.append(String.format("%05d", SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2])));
i += 2;
//endseq
int endLength = bytes[i + 1];
sb.append(String.format("%03d", endLength));
i += 1;
if (endLength != 0) {
for (int j = i + 1; j < i + 1 + endLength; j++) {
sb.append((char) bytes[j]);
}
i += endLength;
}
//flankseq
int flankLength = bytes[i + 1];
sb.append(String.format("%03d", flankLength));
i += 1;
if (flankLength != 0) {
for (int j = i + 1; j < i + 1 + flankLength; j++) {
sb.append((char) bytes[j]);
}
i += flankLength;
}
break;
}
case 'Y': {
//must be Y
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
break;
}
}
}
}
return sb.toString().toCharArray();
}
最后调用它的测试,这包括破坏代码的字符串。 AFAIK,直方图生成的字符串输入有效。
@Test
public void non_workingsample() throws Exception{
String sample = "T00000000010000000001VTEL_Telomeric_repeat_Mus_musculus000X000008pu1.AE.100001000000W0000000000000000000000000101Y0000001236M00705I106I022I010I007I018I002I014I012EI004I020I006I011I004F002R028004007004008036012001017010007087005007010005008053001008003010005032000009003005007F002R019006002010004001000010006002004001000014004007003000001007J002004I007B002R010003001009007001003006002004006B002R010008000006005002000005006009010F002R007009004006005002000008J002005R053004002001005000006008044010006002004002039006009002012009046011012004009002046008013002010005067002012002007001042004010003011003040004006002011007025001014001J002006R017016004005003012006011000006002006005004001006001008J002003R009001014002006005000002007002J002005R038003002004002008011003001003001012004003000002003013004001000006000008006003001005002005010003002007000009003004002J002004R014005006001002008003006005130003006004006009B002R007001005009005001000006J002002I005B002R016018002007006002000009000005003000002003002004010F002R022004002004003001000007000007008001003006003004006002000005004011002F002R010005004008006000002004000004008J002002R004008002005003B002I006I003J002007R006000001005002011004B002R028004003014003001000009003007010003000006003010008000001006003012009000001008002012006F002I008AI007F003R022004001006011002000011004007005001000007003008004002000010001010006F002J002006I012I009B002R004006003015003J002002R016012004012002001002003002007002001000009005012010B002R014005000011006002000005002024008000006009002J002008R017004002009001009012003000005003004002003000010003005J002004S096000010000400003000060000900011000010000700002000070001700009000020001000002000050000800011000100000700006000370266000176000650021600022001050059100250000950002900091002720040900384002580024600104001490062400445001350006500185001730037500322002530004600149001710036700317001500004600104001790047300405001510005900090001640032100329001280003000073001580032100390001000003200076001070028600327013140004900103001380035300314002340004500208001830051000486002340000900003000180000100004J002002AI002EB002EB003F002B003I002B012S042000030000100002000070002401180006020016000658006480119001936003840008700339003650050200471002680004400234001750027301298002280010200226002780043100748010770024400997009950194303337000180001000003001410001100017B004I003I008M01730AI004I002I006F002AI004J002007R010013001003106006025022011001017B005S02500146000130016700336008750098100406000930000200454010770103600995002660000401095021900254000021000020000300047000080001500004J002002R010004005007003000003002007009005J002003R011006041025001000005024021013005004F002S19300011000030006700089001710046800302001190004600173002720041300314001220008400215003980056600636001460006600193003020044900399001240012300192002910051600427001450008700327003630064900523002040010900248003910068700605003500536300550007280137502166001800010300208009770054501811001290009400182002680051300382027310010200193002550050700388003330066000504006090123801354000060000200004000720000701363000060000000002000050000600998000020000000004000030001303781000070000000009000020001800015000040000800006000110002400009000050000100000000030000600807000630016100239004590051900283000740021600337005320080600300000880022200330006500076200363000870023600333006300077500349000930091600323005790082400391001160034500404008010227700833002490064200872019740071900287000850022400368005510074800302000780020600334006240071500313000640023500327007841768101020002910086701185023660031500012000030000500132000050100500002000030000100002000090000200005000000000100005000040629100006J002005I016I013M00268I008I005I008I010I021I007B002W0000000000000000000000472967S13900008000370001400498005110137801226005040015600006005010108901306004630015600002004610104801417004480011700006005810111601281004800013200003005230121601281004870011200001021010110601316004670016900003006400111401467129590012900005004960098904763004860014700005005540114201387004640012400007004880115801238004850015200006004560111701718005260014200004004810133403533004840015100003004230144104702004970014000006005260142401264005850011400004005190110601801004650013400005004560116401302004630012700010005880116501253004490011300008005850104110574004750010300007002750062405034014690007600006002270055200667001820007700004002450046500565002170006000001002240050404887002150008100002000310004500023J002010R014007008005006000007004003005009001004000001J002002R008000006002004003004000006B005I010B003EAI009EI002AF002R007005001000001002000005F002B003I006EB004I007EJ002002F002I005EB007EB002I007B005EB027I047B009M26433B004EB005Z0000081801B003M04952B002M02159000000000000000000";
byte[] b = APUChunkEncoder.encodeRawSPEC(sample);
//debugging
for(byte d : b){
System.out.println((char) d + " " + (int)d);
}
char[] result = APUChunkDecoder.decodeBinarySPECtoRAW(b);
assertEquals(sample, String.valueOf(result));
}
测试结果:
T00000000010000000001VTEL_Telomeric_repeat_Mus_musculus000X000008pu1.AE.100001000000W0000000000000000000000000101Y0000001236M00705I106I022I010I007I018I002I014I012EI004I020I006I011I004F002R028004007004008036012001017010007087005007010005008053001008003010005032000009003005007F002R019006002010004001000010006002004001000014004007003000001007J002004I007B002R010003001009007001003006002004006B002R010008000006005002000005006009010F002R007009004006005002000008J002005R053004002001005000006008044010006002004002039006009002012009046011012004009002046008013002010005067002012002007001042004010003011003040004006002011007025001014001J002006R017016004005003012006011000006002006005004001006001008J002003R009001014002006005000002007002J002005R038003002004002008011003001003001012004003000002003013004001000006000008006003001005002005010003002007000009003004002J002004R014005006001002008003006005130003006004006009B002R007001005009005001000006J002002I005B002R016018002007006002000009000005003000002003002004010F002R022004002004003001000007000007008001003006003004006002000005004011002F002R010005004008006000002004000004008J002002R004008002005003B002I006I003J002007R006000001005002011004B002R028004003014003001000009003007010003000006003010008000001006003012009000001008002012006F002I008AI007F003R022004001006011002000011004007005001000007003008004002000010001010006F002J002006I012I009B002R004006003015003J002002R016012004012002001002003002007002001000009005012010B002R014005000011006002000005002024008000006009002J002008R017004002009001009012003000005003004002003000010003005J002004S096000010000400003000060000900011000010000700002000070001700009000020001000002000050000800011000100000700006000370266000176000650021600022001050059100250000950002900091002720040900384002580024600104001490062400445001350006500185001730037500322002530004600149001710036700317001500004600104001790047300405001510005900090001640032100329001280003000073001580032100390001000003200076001070028600327013140004900103001380035300314002340004500208001830051000486002340000900003000180000100004J002002AI002EB002EB003F002B003I002B012S042000030000100002000070002401180006020016000658006480119001936003840008700339003650050200471002680004400234001750027301298002280010200226002780043100748010770024400997009950194303337000180001000003001410001100017B004I003I008M01730AI004I002I006F002AI004J002007R010013001003106006025022011001017B005S02500146000130016700336008750098100406000930000200454010770103600995002660000401095021900254000021000020000300047000080001500004J002002R010004005007003000003002007009005J002003R011006041025001000005024021013005004F002S193 C22784 ᆱ ᅯ.w . ᆳ ン:z T00140903820020316796メ B000 .マ| { #ᆱム W000071001107001-1190034276044 m ヌᆵ]^&_v|bᄡ g !チ ^ ᄊ ~ᆱf トM37889aᅱJ005006 H0000458835 ₩
ᅤ |b '? ᄀ ᅨJ000216 Q051210036045612X000222 JハkW ↓ Mv]] ヤCC8ヌt Yヤ!¥|bA ツh쐬U ¢ p'↓.N ᅫ Npᅨ9@ → GE#cᄀ> ; ト ■ モ JII
MI|bII|bI
IIBW 7ヌSヒ|b % bᅧワ Aᅬワ ᅪノu 00581092¢ト p 5|bR$ᅮᄅ タZᄏ゚2チ ロ₩モ *vk| │ニᅱ¥リ ]ᄊホ 000£6ᅪ
¦ラ ᄃᄀ^フ ミI002r R004009007209001ニ フᅬ
L0042796261004q |b I002N04100475g pᆰᄑL0000393443000(ロᄊ M01024 5 < ¢ ᅲ Q005120002031661 J002010R014007008005006000007004003005009001004000001J002002R008000006002004003004000006B005I010B003EAI009EI002AF002R007005001000001002000005F002B003I006EB004I007EJ002002F002I005EB007EB002I007B005EB027I047B009M26433B004EB005Z0000081801B003M04952B002M02159000000000000000000
显然,有些字符没有被正确编码或解码。但我无法弄清楚哪个......
非常欢迎任何想法或建议。感谢。
编辑:缺少部分代码:
令牌:
/** All lengths and heights in bits.
* All 1's are to be ignored in writing
* i.e 1 - 0 is transcoded as A.
* 1 -1 is transcoded as E
* 1 - 209 is transcoded as I209
* 1 - 2 is transcoded as I002 **/
public enum Token {
A (1, 0),
B (8, 0),
C (16,0),
D (32,0),
E (1, 1),
F (8, 1),
G (16,1),
H (32,1),
I (1 ,8),
J (8, 8),
K (16,8),
L (32,8),
M (1,16),
N (8,16),
O (16,16),
P (32,16),
Q (16,32),
Z (1,32);
private final int length;
private final int height;
Token(int length, int height) {
this.length = length;
this.height = height;
}
public int getLength() {
return length;
}
public int getHeight() {
return height;
}
}
Spectools,只是一个辅助类:
public class SPECTools {
public static byte[] encode2BitStringDecimal(int i) {
byte[] result = new byte[2];
result[0] = (byte) (i & 0xFF);
result[1] = (byte) ((i >> 8) & 0xFF);
return result;
}
public static byte[] encode4BitStringDecimal(int i) {
byte[] result = new byte[4];
result[0] = (byte) (i >> 24 & 0xFF);
result[1] = (byte) (i >> 16 & 0xFF);
result[2] = (byte) (i >> 8 & 0xFF);
result[3] = (byte) (i & 0xFF) ;
return result;
}
public static int convertFromByteArray2(byte byte1, byte byte2){
return (byte2 & 0xFF) << 8 | (byte1 & 0xFF);
}
public static int convertFromByteArray4(byte byte1, byte byte2, byte byte3, byte byte4){
return byte1 << 24 | (byte2 & 0xFF) << 16 | (byte3 & 0xFF) << 8 | (byte4 & 0xFF);
}
}