我正在研究直方图的字符串表示的自定义编码和解码。
String包含有关开始时直方图的信息,然后进入bin的标记和数字表示。令牌表示下一个“运行”箱中箱高度和长度的长度的细节。代币是A,B,C,D ......等。
下面的代码适用于许多直方图,但我发现了一些破坏代码的异常。我不知道为什么,如果编码或解码错误,我无法解决。所以我会发布两者。
下面有很多代码,如果您发现任何明显的错误或可能的错误,请告诉我。
我遇到的一个问题是,我无法分辨这是导致问题的编码或解码。关于如何区分这一点的任何想法都会有所帮助。
我已经建立了很多单元测试,除此之外,所有单元测试都通过了。
首先将字符串编码为字节表示的代码它基本上通过获取令牌值,获取字符串的下N个数字并将其转换为字节来实现:
/** takes in a rawSPEC and returns a byte array to be encoded
* @param rawSPEC raw SPEC
* @return byte array
*/
public static byte[] encodeRawSPEC(String rawSPEC){
//work out the size in advance.
List<Byte> byteList = new LinkedList<>();
//loop through the raw string
int length = 0;
int height = 0;
for(int i = 0; i < rawSPEC.length(); i++){
//read the token from the string
char token = rawSPEC.charAt(i);
byteList.add((byte)(token & 0xFF));
//find the length and height in bytes of the valid token
boolean nonSpecial = false;
for(Token t : Token.values()){
if(token == t.name().charAt(0)){
nonSpecial = true;
height = t.getHeight();
length = t.getLength();
}
}
if(nonSpecial) {
//read the height and length values
String tokenLength;
String tokenHeight;
if (length != 0 && length != 1) {
if (length == 8) {
//1 byte. just convert to byte and add
tokenLength = rawSPEC.substring(i + 1, i + 4);
i += 3;
byteList.add((byte)(Integer.parseInt(tokenLength)));
} else if (length == 16) {
//2 bytes. convert to byte array and add each sequencially
tokenLength = rawSPEC.substring(i + 1, i + 6);
i += 5;
byte[] byt = SPECTools.encode2BitStringDecimal(Integer.parseInt(tokenLength));
for(byte b : byt) byteList.add(b);
} else {
//4 bytes. convert to byte array and add each sequencially
tokenLength = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(tokenLength));
for(byte b : byt) byteList.add(b);
}
}
//encode any heights
if (height != 0 && height != 1) {
if (height == 8) {
tokenHeight = rawSPEC.substring(i + 1, i + 4);
i += 3;
byteList.add((byte)(Integer.parseInt(tokenHeight)));
} else if (height == 16) {
tokenHeight = rawSPEC.substring(i + 1, i + 6);
i += 5;
byte[] byt = SPECTools.encode2BitStringDecimal(Integer.parseInt(tokenHeight));
for(byte b : byt) byteList.add(b);
} else {
tokenHeight = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(tokenHeight));
for(byte b : byt) byteList.add(b);
}
}
}else if(token == 'R'){
String numReads = rawSPEC.substring(i+1, i+4);
byteList.add((byte)(Integer.parseInt(numReads)));
i+=3;
int reads = Integer.parseInt(numReads);
for(int j = 0; j < reads; j++){
String h = rawSPEC.substring(i+1, i+4);
i+=3;
byteList.add((byte)(Integer.parseInt(h)));
}
}else if(token == 'S'){
String numReads = rawSPEC.substring(i+1, i+4);
int a = Integer.parseInt(numReads);
byteList.add((byte)(Integer.parseInt(numReads)));
i+=3;
int reads = Integer.parseInt(numReads);
for(int j = 0; j < reads; j++){
String h = rawSPEC.substring(i+1, i+6);
i+=5;
byte[] byt = SPECTools.encode2BitStringDecimal(Integer.parseInt(h));
for(byte b : byt) byteList.add(b);
}
}else if(token == 'T') {
String h = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(h));
for (byte b : byt) byteList.add(b);
h = rawSPEC.substring(i + 1, i + 11);
i += 10;
byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(h));
for (byte b : byt) byteList.add(b);
}else if(token == 'U'){
String h = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(h));
for (byte b : byt) byteList.add(b);
}else if(token == 'V'){
for(int j = i; j < rawSPEC.length(); j++){
if(rawSPEC.charAt(j) == '0' && rawSPEC.charAt(j+1) == '0' && rawSPEC.charAt(j+1) == '0'){
String h = rawSPEC.substring(i+1, j);
for(char c : h.toCharArray()){
byteList.add((byte)(c & 0xFF));
}
i += h.length();
byteList.add((byte)0);
i+=3;
break;
}
}
}else if(token == 'W'){
//encode the 6 escape characters
for(int j = 0; j < 6; j ++){
String numReads = rawSPEC.substring(i+1, i+4);
byteList.add((byte)(Integer.parseInt(numReads)));
i+=3;
}
//encode the 4 Bit number
String number = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(number));
for(byte b : byt) byteList.add(b);
}else if(token == 'X'){
//code
String code = rawSPEC.substring(i+1, i+4);
byteList.add((byte)(Integer.parseInt(code)));
i+=3;
//statement
String statementLength = rawSPEC.substring(i+1, i+4);
int leng = Integer.parseInt(statementLength);
i+=3;
byteList.add((byte)(leng));
for(int j = i+1; j < i+1 + leng; j++){
byteList.add((byte)rawSPEC.charAt(j));
}
i+=leng;
//extension
String extension = rawSPEC.substring(i+1, i+6);
byte[] b = SPECTools.encode2BitStringDecimal(Integer.parseInt(extension));
for(byte by: b ){
byteList.add(by);
}
i+=5;
//endseq
int endLength = (Integer.parseInt(rawSPEC.substring(i+1, i+4)));
byteList.add((byte) endLength);
i+=3;
if(endLength != 0){
for(int j = i+1; j < i+1 + endLength; j++){
byteList.add((byte)rawSPEC.charAt(j));
}
i+=endLength;
}
//endseq
int flankLength = (Integer.parseInt(rawSPEC.substring(i+1, i+4)));
byteList.add((byte) flankLength);
i+=3;
if(flankLength != 0){
for(int j = i+1; j < i+1 + flankLength; j++){
byteList.add((byte)rawSPEC.charAt(j));
}
i+=flankLength;
}
}else if(token == 'Y'){
//must be Y
String Ybit = rawSPEC.substring(i + 1, i + 11);
i += 10;
byte[] byt = SPECTools.encode4BitStringDecimal(Integer.parseInt(Ybit));
for(byte b : byt) byteList.add(b);
}
}
//convert to array
byte[] byteArray = new byte[byteList.size()];
for(int i = 0; i < byteList.size(); i++) byteArray[i] = byteList.get(i);
return byteArray;
}
其次,解码代码。以另一种方式工作,抓住令牌。获取适当的字节数,然后将字节转换为整数并附加到string:
public static char[] decodeBinarySPECtoRAW(byte[] bytes){
StringBuilder sb = new StringBuilder();
int height = 0;
int length = 0;
int histogramLength = 0;
for (int i = 0; i < bytes.length; i++) {
char token = (char) bytes[i];
sb.append(token);
boolean nonSpecial = false;
for (Token t : Token.values()) {
if (token == t.name().charAt(0)) {
nonSpecial = true;
height = t.getHeight();
length = t.getLength();
}
}
if (nonSpecial) {
//length
if (length != 0 && length != 1) {
if (length == 8) {
//1 byte
sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
histogramLength+=bytes[i+1] & 0xFF;
i++;
} else if (length == 16) {
//2 bytes
histogramLength+= SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2]);
sb.append(String.format("%05d", SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2])));
i += 2;
} else {
//4 bytes
histogramLength += SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4]);
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
}
}else{
histogramLength++;
}
//height
if (height != 0 && height != 1) {
if (height == 8) {
//1 byte
sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
i++;
} else if (height == 16) {
//2 bytes
sb.append(String.format("%05d", SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2])));
i += 2;
} else {
//4 bytes
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
}
}
} else {
switch (token) {
case 'R': {
int numReads = (int)bytes[i + 1];
i++;
sb.append(String.format("%03d", numReads & 0xFF));
for (int j = 0; j < numReads; j++) {
int nextNum = bytes[i + 1];
sb.append(String.format("%03d", nextNum & 0xFF));
histogramLength++;
i++;
}
break;
}
case 'S': {
int numReads = (int)bytes[i + 1];
i++;
sb.append(String.format("%03d", numReads & 0xFF));
for (int j = 0; j < numReads; j++) {
histogramLength++;
int in = SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2]);
sb.append(String.format("%05d", in));
i += 2;
}
break;
}
case 'T': {
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
break;
}
case 'U': {
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
break;
}
case 'V': {
List<Byte> VBytes = new ArrayList<>();
boolean escapeFound = false;
while (!escapeFound) {
if( i + 1< bytes.length) {
if ( bytes[i + 1] == 0) escapeFound = true;
else {
VBytes.add(bytes[i + 1]);
i += 1;
}
}
}
for (byte b : VBytes) {
sb.append((char) b);
}
sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
i += 1;
break;
}
case 'W': {
for (int j = 0; j < 6; j++) {
sb.append(String.format("%03d", bytes[i + 1]));
i += 1;
}
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
break;
}
case 'X': {
sb.append(String.format("%03d", bytes[i + 1] & 0xFF));
i += 1;
//get length of the statement
int statementLength = bytes[i + 1] & 0xFF;
sb.append(String.format("%03d", statementLength));
i += 1;
for (int j = i + 1; j < i + 1 + statementLength; j++) {
sb.append((char) bytes[j]);
}
i += statementLength;
sb.append(String.format("%05d", SPECTools.convertFromByteArray2(bytes[i + 1], bytes[i + 2])));
i += 2;
//endseq
int endLength = bytes[i + 1];
sb.append(String.format("%03d", endLength));
i += 1;
if (endLength != 0) {
for (int j = i + 1; j < i + 1 + endLength; j++) {
sb.append((char) bytes[j]);
}
i += endLength;
}
//flankseq
int flankLength = bytes[i + 1];
sb.append(String.format("%03d", flankLength));
i += 1;
if (flankLength != 0) {
for (int j = i + 1; j < i + 1 + flankLength; j++) {
sb.append((char) bytes[j]);
}
i += flankLength;
}
break;
}
case 'Y': {
//must be Y
sb.append(String.format("%010d", SPECTools.convertFromByteArray4(bytes[i + 1], bytes[i + 2], bytes[i + 3], bytes[i + 4])));
i += 4;
break;
}
}
}
}
return sb.toString().toCharArray();
}
最后调用它的测试,这包括破坏代码的字符串。 AFAIK,直方图生成的字符串输入有效。
@Test
public void non_workingsample() throws Exception{
String sample = "T00000000010000000001VTEL_Telomeric_repeat_Mus_musculus000X000008pu
byte[] b = APUChunkEncoder.encodeRawSPEC(sample);
//debugging
for(byte d : b){
System.out.println((char) d + " " + (int)d);
}
char[] result = APUChunkDecoder.decodeBinarySPECtoRAW(b);
assertEquals(sample, String.valueOf(result));
}
测试结果:
T00000000010000000001VTEL_Telomeric_repeat_Mus_musculus000X000008puᆱ ᅯ.w . ᆳ ン:z T00140903820020316796メ B000 .マ| { #ᆱム W000071001107001-1190034276044 m ヌᆵ]^&_v|bᄡ g !チ ^ ᄊ ~ᆱf トM37889aᅱJ005006 H0000458835 ₩
ᅤ |b '? ᄀ ᅨJ000216 Q051210036045612X000222 JハkW ↓ Mv]] ヤCC8ヌt Yヤ!¥|bA ツh쐬U ¢ p'↓.N ᅫ Npᅨ9@ → GE#cᄀ> ; ト ■ モ JII
MI|bII|bI
IIBW 7ヌSヒ|b % bᅧワ Aᅬワ ᅪノu 00581092¢ト p 5|bR$ᅮᄅ タZᄏ゚2チ ロ₩モ *vk| │ニᅱ¥リ ]ᄊホ 000£6ᅪ
¦ラ ᄃᄀ^フ ミI002r R004009007209001ニ フᅬ
L0042796261004q |b I002N04100475g pᆰᄑL0000393443000(ロᄊ M01024 5 < ¢ ᅲ Q005120002031661 J002010R014007008005006000007004003005009001004000001J002002R008000006002004003004000006B005I010B003EAI009EI002AF002R007005001000001002000005F002B003I006EB004I007EJ002002F002I005EB007EB002I007B005EB027I047B009M26433B004EB005Z0000081801B003M04952B002M02159000000000000000000
显然,有些字符没有被正确编码或解码。但我无法弄清楚哪个......
非常欢迎任何想法或建议。感谢。
编辑:缺少部分代码:
令牌:
/** All lengths and heights in bits.
* All 1's are to be ignored in writing
* i.e 1 - 0 is transcoded as A.
* 1 -1 is transcoded as E
* 1 - 209 is transcoded as I209
* 1 - 2 is transcoded as I002 **/
public enum Token {
A (1, 0),
B (8, 0),
C (16,0),
D (32,0),
E (1, 1),
F (8, 1),
G (16,1),
H (32,1),
I (1 ,8),
J (8, 8),
K (16,8),
L (32,8),
M (1,16),
N (8,16),
O (16,16),
P (32,16),
Q (16,32),
Z (1,32);
private final int length;
private final int height;
Token(int length, int height) {
this.length = length;
this.height = height;
}
public int getLength() {
return length;
}
public int getHeight() {
return height;
}
}
Spectools,只是一个辅助类:
public class SPECTools {
public static byte[] encode2BitStringDecimal(int i) {
byte[] result = new byte[2];
result[0] = (byte) (i & 0xFF);
result[1] = (byte) ((i >> 8) & 0xFF);
return result;
}
public static byte[] encode4BitStringDecimal(int i) {
byte[] result = new byte[4];
result[0] = (byte) (i >> 24 & 0xFF);
result[1] = (byte) (i >> 16 & 0xFF);
result[2] = (byte) (i >> 8 & 0xFF);
result[3] = (byte) (i & 0xFF) ;
return result;
}
public static int convertFromByteArray2(byte byte1, byte byte2){
return (byte2 & 0xFF) << 8 | (byte1 & 0xFF);
}
public static int convertFromByteArray4(byte byte1, byte byte2, byte byte3, byte byte4){
return byte1 << 24 | (byte2 & 0xFF) << 16 | (byte3 & 0xFF) << 8 | (byte4 & 0xFF);
}
}