Question

有一个txt文件，其中18列由''分隔，并以,分隔，其中每一行代表sqlite query的插入语句：

    (1999,1999,1999,1999,1999,0,0,'flaggr.png',261,     'Βάκχειος',             'Spl-up','B ',  'Pagrati/Athens,Attica,Greece',     'N/A',   'Hellenic Mythology',      '','', ''),
    (2000,2000,2000,2000,2000,0,2010,'flagru.png',3340, 'Анклав Снов',              'Act',    'G/D ',   'Bryansk,Russia',       '2008-2009(as Vampire''s Crypt),2010-present',   'N/A',     '','', ''),
    (2001,2001,2001,2001,2001,0,2002,'flagru.png',271,  'Аркона',               'Act','P/FO ',  'Moscow,Russia',        '2002(as Гиперборея),2002-present',  'Slavic Pism and FOtales, Legends, Mythology',     '', '', ''),
    (2002,2002,2002,2002,2002,0,1988,'flagru.png',470,      'Аспид',                'Spl-up','PROG ',   'Volgodonsk,Rostovregion,Russia',       '1988-1997,2010-?',  'Politics, Horror, Death',     '', '', ''),
    (2003,2003,2003,2003,2003,0,2000,'flagua.png',359,  'Ірій',             'Unknown','FO D /G ',   'Lviv,Ukraine',     '2000-?',    'Slavic mythology, Ukrainian FOlore',      '', '', ''),
    (2004,2004,2004,2004,2004,0,2011,'flagru.png',3036579,  'Лесьяр',               'Act','P FO ',  'Moscow,Russia',        '2011-present',  'Pism, FOlore, Social matters, Feelings',      '', '', ''),
    (2005,2005,2005,2005,2005,0,2003,'flagru.png',218,  'М8Л8ТХ',               'Act','B  with RAC',    'Tver,Ukraine(posterior),Russia',       '2003-present',  'National Pride, National Socialism, Hatred, War, Intolerance, Pism',      '', '', ''),
    (2006,2006,2006,2006,2006,0,0,'flagru.png',354037,      'Рельос',               'Act','PR/POST-/ (early), G/POST-, Ambient (later)',    'Baltiisk,Kaliningradregion,Russia',        'N/A',   'N/A',     '', '',''),
    (2007,2007,2007,2007,2007,0,2006,'flagru.png',32937,    'Сивый Яр',             'Act','P/POST-B ',  'Vyritsa,Leningradregion,Russia',       '2006-present',  'Pism, Pride, Heritage, Poetry, Slavonic Mythology',       '', '', ''),
    (2008,2008,2008,2008,2008,0,2001,'flagru.png',44,       'Темнозорь',                'Act','FO/B ',  'Moscow,Russia',        '2001-present',  'Nature, Slavonic Pism, War, Right-wing nationalism',      '4394', '', ''),
    (2009,2009,2009,2009,2009,0,1993,'flagru.png',80,       'Эпидемия',             'Act','Pow ',   'Moscow,Russia',        '1993-present',  'Fantasy, Tolkien, Elves',     '', '', ''),
    (2010,2010,2010,2010,2010,0,0,'flagjp.png',354039,      'こくまろみるく',              'Act','G/Pow ', 'N/A,Japan',        'N/A',   'Bizarre, Macabre',        '', '', ''),
    (2011,2011,2011,2011,2011,0,2012,'flagus.png',38723,    'מזמור',                'Act','B/Drone/D ', 'Portland,Oregon,United States',        '2012-present',  'N/A',     '', '', ''),
    (2012,2012,2012,2012,2012,0,2004,'flaglb.png',67,   'دمار',             'Spl-up','B/Death ',    'Hamra,Beirut,Lebanon',     '2004-2006',     'War, Pride, Blasphemy, Supremacy',        '', '', ''),
    (2013,2013,2013,2013,2013,0,2006,'flagcn.png',760,  '原罪',               'Act','B  (early), G/B  (later)',   'Chengdu,SichuanProvince,China',        '2006-present',  'Misanthropy, Hatred, Depression, War, Revelation',        '', '', ''),
    (2014,2014,2014,2014,2014,0,1995,'flagtw.png',443,      '閃靈',               'Act','Melodic B/Death/FO ',    'Taipei,Taiwan',        '1995-present',  'Taiwanese Myths and Legends, Anti-Fascism, History',      '4443', '', ''),
    (2015,2015,2015,2015,2015,0,2001,'flagjp.png',31450,    '電気式華憐音楽集団',                'Act','Pow/G',  'N/A,Japan',        '2001-present',  'Anime, Fantasy, Liberty',     '', '', '');

对齐所有列的最佳方法是什么，例如前两行变为：

(1999,1999,1999,1999,1999,0,0,   'flaggr.png',261,  'Βάκχειος',     'Spl-up',   'B ',   'Pagrati/Athens,Attica,Greece', 'N/A',                                          'Hellenic Mythology',   '','', ''),
(2000,2000,2000,2000,2000,0,2010,'flagru.png',3340, 'Анклав Снов',  'Act',      'G/D ', 'Bryansk,Russia',               '2008-2009(as Vampire''s Crypt),2010-present',  'N/A',                  '','', ''),

我在想：

使用逗号分隔文件中的所有行字符串作为分隔符
计算每列的最大长度并将其存储在内存中
再次循环文件，但这次使用计算的最大长度和写入输出

我提供的代码如下所示，但是我意识到一个问题，有一些列在单引号中有逗号，如'bla1,bla2,bla3' (columns 12 to 18 could have inner commas...) 所以如果我用逗号分割字符串，我就不会得到18列。

在那个问题之后我不知道如何继续...... 用逗号分割的方法是什么，但考虑一些字符串的单引号？

    private static void AdjustColumnsInFile(string filePath, string outputFile)
    {
        //array to store max size of each column
        int[] sizes = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
        foreach (var line in File.ReadLines(filePath))
        {
            var words = line.Split(',');
            if (words.Length == 18)
            {
                var i = 0;
                //get max value of each column
                foreach (var word in words)
                {
                    sizes[i] = sizes[i] < word.Length ? word.Length : sizes[i];
                    i++;
                }
            }
        }

        ...

        using (var sw = new StreamWriter(outputFile))
        {
            foreach (var l in newLines)
            {
                sw.WriteLine($"{l}");
            }
        }
    }

Answer 1

据我所知，你唯一的问题是如何在逗号上拆分字符串，因为有些逗号可能出现在''引号内。你可以用正则表达式来做到这一点：

,(?=(?:[^\']*\'[^\']*\')*[^\']*$)

它基本上匹配逗号，后跟零或偶数引号（'）。如果逗号出现在''引号内 - 在有效字符串中，它后面会出现奇数引号，因此不匹配。

其余应该很容易，首先计算尺寸：

//array to store max size of each column
int[] sizes = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
foreach (var line in File.ReadLines(filePath)) {
    var tmp = line.Trim(); // remove leading and trailing whitespace
    tmp = tmp.Remove(tmp.Length - 2, 2); // remove closing ) and , or ;
    tmp = tmp.Remove(0, 1); // remove opening (   
    // split by comma                 
    var words = Regex.Split(tmp, @",(?=(?:[^\']*\'[^\']*\')*[^\']*$)");
    if (words.Length == 18) {
        for (int i = 0; i < words.Length; i++) {
            var word = words[i].Trim(); // remove whitespace
            sizes[i] = sizes[i] < word.Length ? word.Length : sizes[i];
        }
    }
    else throw new Exception("Invalid number of columns");
}

然后重复并将空格附加到与预期大小不匹配的列：

using (var writer = new StreamWriter(outputFile)) {
    foreach (var line in File.ReadLines(filePath)) {                    
        var tmp = line.Trim(); // remove trailing whitespace
        bool hadTrailingComma = tmp.EndsWith(",");
        tmp = tmp.Remove(tmp.Length - 2, 2); // remove closing ) and , or ;
        tmp = tmp.Remove(0, 1); // remove opening (                                                            
        var words = Regex.Split(tmp, @",(?=(?:[^\']*\'[^\']*\')*[^\']*$)");
        var newLine = String.Join(",", words.Select((w, i) =>
        {
            w = w.Trim();
            var targetSize = sizes[i];
            if (w.Length < targetSize)
                return w + new string(' ', targetSize - w.Length); // append spaces until max length
            return w;
        }));

        writer.WriteLine($"({newLine}){(hadTrailingComma ? "," : ";")}");
    }
}

请注意，由于こくまろみるく之类的unicode字符，您的输出文件可能看起来未正确对齐，而实际上它是（即每列的字符大小相同）。

如何根据每列的最大长度在txt文件中对齐不同长度的列？

1 个答案: