我正在尝试将一个字符串拆分为字符串数组。我当前的文本如下所示,它全部位于同一个字符串中,并且包含换行符(\r\n)和空格。为了便于阅读,我在这里整理了一个更清晰的例子。
BFFPPB14 Dark Chocolate Dried Cherries 14 oz (397g)
INGREDIENTS: DARK CHOCOLATE (SUGAR, CHOCOLATE LIQUOR, COCOA BUTTER,
ANHYDROUS MILK FAT, SOYA LECITHIN, VANILLIN [AN ARTIFICIAL FLAVOR]), DRIED
TART CHERRIES (CHERRIES, SUGAR), GUM ARABIC, CONFECTIONER'S GLAZE.
CONTAINS: MILK, SOY
ALLERGEN INFORMATION: MAY CONTAIN TREE NUTS, PEANUTS, EGG AND
WHEAT.
01/11/2019
Description: Sweetened dried Montmorency cherries that are panned with dark chocolate.
Storage Conditions: Store at ambient temperatures with a humidity less than 50%.
Shelf Life: 9 months
Company Name
Item No.: 701804
Bulk: 415265
Supplier: Cherryland's Best
WARNING: CHERRIES MAY CONTAIN PITS
我的Regex看起来像这样
// Split the label text on every known field marker. Only the date alternative is wrapped in a
// capturing group, so Regex.Split returns the matched date as its own element while the other
// markers are dropped from the result. The dot in "Item No.:" is escaped so that it matches a
// literal '.' instead of any character.
List<string> result = Regex.Split(text, @"INGREDIENTS: |CONTAINS: |ALLERGEN INFORMATION: |(\d{1,2}/\d{1,2}/\d{2,4})|Description: |Storage Conditions: |Shelf Life: |Company Name|Item No\.: |Bulk: |Supplier: |WARNING: ").ToList();
下面是 result 的内容
注意:第一个字符串是产品名称
有时我收到的字符串中没有供应商(Supplier)或警告(WARNING)字段;如果某个分隔标记在文本中没有出现,我希望拆分结果中对应的位置是一个空字符串。
EX:
result[0] = "blabla"
result[1] = ""
result[2] = "blabla"
这样我就知道 result[1] 是按分隔标记(INGREDIENTS:)拆分得到的内容,并且可以把它赋值给相应的字段。
答案 0 :(得分:0)
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace ConsoleApp12
{
    /// <summary>
    /// Console demo that pulls the individual fields (name, ingredients, date, ...) out of a
    /// product-label text by scanning it line by line for known field labels.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Parses the embedded sample label, prints every field as "name => value" and then waits
        /// for a line on standard input. Fields that are absent from the text print as empty.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            // Sample label. This is a regular (non-verbatim) literal so that \r\n are real line
            // breaks, the same shape as text read from a file or a database.
            var str = "BFFPPB10 Dark Chocolate Macadamia Nuts 11 oz (312g)\r\nINGREDIENTS: DARK CHOCOLATE (SUGAR, CHOCOLATE, COCOA BUTTER, \r\nANHYDROUS MILK FAT, SOY LECITHIN, VANILLA), MACADAMIA NUTS, SEA SALT.\r\nCONTAINS: MACADAMIA NUTS, MILK, SOY.\r\nALLERGEN INFORMATION: MAY CONTAIN OTHER TREE NUTS, PEANUTS, EGG AND\r\nWHEAT.\r\n01/11/2019\r\nDescription: Dry roasted, salted macadamias covered in dark chocolate.\r\nStorage Conditions: Store at ambient temperatures with a humidity less than 50%. \r\nShelf Life: 12 months\r\nBlain's Farm & Fleet\r\nItem No.: 701772\r\nBulk: 421172\r\nSupplier: Devon's\r\n";

            // Labels that introduce a field. A field's value runs until the next line that
            // starts with one of these labels.
            const string IngredientsKey = "INGREDIENTS:";
            const string ContainsKey = "CONTAINS:";
            const string AllergenInformationKey = "ALLERGEN INFORMATION:";
            const string DescriptionKey = "Description:";
            const string StorageConditionsKey = "Storage Conditions:";
            const string ShelfLifeKey = "Shelf Life:";
            const string ItemNoKey = "Item No.:";
            const string BulkKey = "Bulk:";
            const string SupplierKey = "Supplier:";
            const string WarningKey = "WARNING:";

            // The date line has no label; it is recognised by its shape instead.
            const string DatePattern = @"^\d{1,2}/\d{1,2}/\d{4}$";

            // The company name has no label either; it is the line right before this key.
            const string KeyAfterCompanyName = ItemNoKey;

            var keys = new[]
            {
                IngredientsKey, ContainsKey, AllergenInformationKey, DescriptionKey, StorageConditionsKey,
                ShelfLifeKey, ItemNoKey, BulkKey, SupplierKey, WarningKey
            };

            // Accept any line-break style and trim each line, so that trailing blanks neither
            // defeat the anchored date pattern nor end up doubled when lines are joined.
            var lines = str
                .Split(new[] { "\r\n", "\n", "\r" }, StringSplitOptions.RemoveEmptyEntries)
                .Select(l => l.Trim())
                .Where(l => l.Length > 0)
                .ToArray();

            // True when the line starts with one of the field labels (ordinal, case-sensitive).
            bool IsKeyLine(string line) => keys.Any(k => line.StartsWith(k, StringComparison.Ordinal));

            // Prints one field followed by a blank line.
            void Log(string key, string val)
            {
                Console.WriteLine($"{key} => {val}");
                Console.WriteLine();
            }

            // Drops every line equal to the given text so that label-less fields which were
            // already consumed are not appended to a neighbouring multi-line value.
            void RemoveLine(string line)
            {
                if (!string.IsNullOrEmpty(line))
                {
                    lines = lines.Where(l => l != line).ToArray();
                }
            }

            // Returns the line starting with the key (label included) plus all following lines
            // up to the next label, joined with single spaces; empty when the key is absent.
            string GetMultiLine(string key)
            {
                var start = Array.FindIndex(lines, l => l.StartsWith(key, StringComparison.Ordinal));
                if (start < 0)
                {
                    return string.Empty;
                }

                var parts = new List<string> { lines[start] };
                for (var i = start + 1; i < lines.Length && !IsKeyLine(lines[i]); i++)
                {
                    parts.Add(lines[i]);
                }

                return string.Join(" ", parts);
            }

            // Returns the line directly before the line starting with the given key, provided
            // that line is not itself a labelled field; empty when there is no such line.
            string GetLineBefore(string followingKey)
            {
                var index = Array.FindIndex(lines, l => l.StartsWith(followingKey, StringComparison.Ordinal));
                if (index <= 0)
                {
                    // Key missing, or it is the first line and nothing precedes it.
                    return string.Empty;
                }

                var previous = lines[index - 1];
                return IsKeyLine(previous) ? string.Empty : previous;
            }

            // 1st: fields without a label. They are removed once read so they cannot be
            // mistaken for continuation lines of a labelled field.
            var itemName = lines.FirstOrDefault();
            RemoveLine(itemName);
            var year = lines.FirstOrDefault(l => Regex.IsMatch(l, DatePattern));
            RemoveLine(year);
            var companyName = GetLineBefore(KeyAfterCompanyName);
            RemoveLine(companyName);

            // 2nd: fields introduced by a label.
            var ingredients = GetMultiLine(IngredientsKey);
            var contains = GetMultiLine(ContainsKey);
            var allergenInformation = GetMultiLine(AllergenInformationKey);
            var description = GetMultiLine(DescriptionKey);
            var storageConditions = GetMultiLine(StorageConditionsKey);
            var shelfLife = GetMultiLine(ShelfLifeKey);
            var itemNo = GetMultiLine(ItemNoKey);
            var bulk = GetMultiLine(BulkKey);
            var supplier = GetMultiLine(SupplierKey);
            var warning = GetMultiLine(WarningKey);

            // 3rd: print everything in label order.
            Log("ItemName", itemName);
            Log("Ingredients", ingredients);
            Log("contanins", contains);
            Log("Allergen Infromation", allergenInformation);
            Log("Year", year);
            Log("Description", description);
            Log("Storage Conditions", storageConditions);
            Log("Shelf Life", shelfLife);
            Log("CompanyName", companyName);
            Log("Item No", itemNo);
            Log("Bulk", bulk);
            Log("Supplier", supplier);
            Log("warning", warning);

            // Keep the console window open until Enter is pressed.
            Console.ReadLine();
        }
    }
}
将输出
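ItemName => BFFPPB10 Dark Chocolate Macadamia Nuts 11 oz (312g)
Ingredients => INGREDIENTS: DARK CHOCOLATE (SUGAR, CHOCOLATE, COCOA BUTTER, ANHYDROUS MILK FAT, SOY LECITHIN, VANILLA), MACADAMIA NUTS, SEA SALT.
contanins => CONTAINS: MACADAMIA NUTS, MILK, SOY.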
过敏原信息=>过敏原信息:可能包含其他树 坚果,花生,鸡蛋和小麦。
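Year => 01/11/2019
Description => Description: Dry roasted, salted macadamias covered in dark chocolate.
Storage Conditions => Storage Conditions: Store at ambient temperatures with a humidity less than 50%.
Shelf Life => Shelf Life: 12 months
CompanyName => Blain's Farm & Fleet
Item No => Item No.: 701772
Bulk => Bulk: 421172
Supplier => Supplier: Devon's
warning =>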
答案 1 :(得分:0)
如果在处理量很大的应用程序中使用正则表达式,可能会带来性能问题。以下是一个可以使用的正则表达式。解析产品行和“公司名称”行要困难一些:产品代码是否遵循固定的模式并不清楚,而且公司名称行不像其他字段那样带有“:”,因此正则表达式在这两处写得有些取巧(hacky):
using System;
using System.Text.RegularExpressions;
using System.Linq;
namespace so20190113_01 {
    /// <summary>
    /// Console demo that extracts the fields of a product label with a single multi-line
    /// regular expression that has one named group per field.
    /// </summary>
    class Program {
        /// <summary>
        /// Runs the pattern over the embedded sample label and prints every named group that
        /// took part in a match as <c>Name(True): 'value'</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args) {
            // Sample label. The literal is verbatim, so its line breaks are whatever the source
            // file uses (LF or CRLF); the pattern below has to cope with both.
            string text =
@"BFFPPB14 Dark Chocolate Dried Cherries 14 oz (397g)
INGREDIENTS: DARK CHOCOLATE (SUGAR, CHOCOLATE LIQUOR, COCOA BUTTER, ANHYDROUS MILK FAT, SOYA LECITHIN, VANILLIN [AN ARTIFICIAL FLAVOR]), DRIED TART CHERRIES (CHERRIES, SUGAR), GUM ARABIC, CONFECTIONER'S GLAZE.
CONTAINS: MILK, SOY
ALLERGEN INFORMATION: MAY CONTAIN TREE NUTS, PEANUTS, EGG AND WHEAT.
01/11/2019
Description: Sweetened dried Montmorency cherries that are panned with dark chocolate.
Storage Conditions: Store at ambient temperatures with a humidity less than 50%. Shelf Life: 9 months
Company Name
Item No.: 701804
Bulk: 415265
Supplier: Cherryland's Best
WARNING: CHERRIES MAY CONTAIN PITS";

            // One alternative per field, each anchored to a whole line (Multiline).
            // * After a label only horizontal white space is skipped ([^\S\r\n]*). A plain \s*
            //   would also swallow the line break and capture the NEXT line as the value when
            //   the field is empty, which is what happens on the "Company Name" line.
            // * The date alternative ends in \r?$ because $ only matches before \n, so on
            //   CRLF text the bare $ would never match right after the digits. The other
            //   alternatives end in .* or [^:]+, which absorb the \r (removed by Trim below).
            string pat =
                @"^\s*(?<product>\w+\s+\w+\s+\w*[^:]+)$
                |^ingredients:[^\S\r\n]*(?<ingredients>.*)$
                |^contains:[^\S\r\n]*(?<contains>.*)$
                |^allergen\s+information:[^\S\r\n]*(?<allergen>.*)$
                |^(?<date>(\d{1,2}/\d{1,2}/\d{2,4}))\r?$
                |^description:[^\S\r\n]*(?<description>.*)$
                |^storage\sconditions:[^\S\r\n]*(?<storage>.*)$
                |^shelf\slife:[^\S\r\n]*(?<shelf>.*)$
                |^company\sname[^\S\r\n]*(?<company>.*)$
                |^item\sno\.:[^\S\r\n]*(?<item>.*)$
                |^bulk:[^\S\r\n]*(?<bulk>.*)$
                |^supplier:[^\S\r\n]*(?<supplier>.*)$
                |^warning:[^\S\r\n]*(?<warning>.*)$
                ";
            Regex r = new Regex(pat, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);

            // Match the regular expression pattern against a text string.
            Match m = r.Match(text); // you might want to use the overload that supports a timeout value
            Console.WriteLine("Start---");
            while (m.Success) {
                // Cast<Group>() keeps this working on frameworks where GroupCollection is only
                // a non-generic collection.
                foreach (Group g in m.Groups.Cast<Group>().Where(x => x.Success)) {
                    // Numbered groups (the whole match and the unnamed date sub-group) do not
                    // correspond to a field.
                    if (char.IsDigit(g.Name[0])) {
                        continue;
                    }

                    // "product" -> "Product", "ingredients" -> "Ingredients", ...
                    string label = char.ToUpperInvariant(g.Name[0]) + g.Name.Substring(1);
                    Console.WriteLine($"{label}({g.Success}): '{g.Value.Trim()}'");
                }
                m = m.NextMatch();
            }
            Console.WriteLine("End---");
        }
    }
}
答案 2 :(得分:0)
我认为解析器是唯一的方法。最初,我尝试使用此正则表达式:
^([\w \.]+?):([\s\S]+?)(?=((^[\w \.]+?):))
其中的关键部分是前瞻断言 `(?=...)`,它使正则表达式可以匹配两个标签之间的所有文本。但是,它不适用于最后一项,因为最后一项后面没有另一个标签,而我找不到一个在“后面不存在该模式”时也能停止匹配的正则表达式。如果存在这样的正则表达式,就可以用一行代码完成所有操作:
KeyValuePair<string, string>[] kvs = null;
// Each match is one "Label: value" entry: group 1 is the label, group 2 the value.
// The lazy value stops where the look-ahead succeeds: either right before the next line that
// starts with a label, or (through the \z alternative) at the very end of the input, so the
// final entry, which has no label after it, is matched as well.
kvs = Regex.Matches(text, @"^([\w \.]+?):([\s\S]+?)(?=^[\w \.]+?:|\z)", RegexOptions.Multiline)
    .Cast<Match>()
    .Select(x => new KeyValuePair<string, string>(x.Groups[1].Value, x.Groups[2].Value.Trim(' ', '\r', '\n', '\t')))
    .ToArray();
这段代码可以很好地完成任务。另外,文档的格式并不一致,因为 Company Name 后面没有冒号。冒号是唯一可行的锚定模式,因为各字段的内容会被换行符打断。
KeyValuePair<string, string>[] kvs = null;
// Otherwise, you have to write a parser.
// Every line that contains a colon is treated as the start of an entry; the match covers the
// line from its beginning up to and including the first colon.
var matches = Regex.Matches(text, @"^.+?:", RegexOptions.Multiline).Cast<Match>().ToArray();
kvs = new KeyValuePair<string, string>[matches.Length];

// Builds one entry: the label is the match without its trailing colon, the description is
// everything between the colon and match1EndIndex (exclusive) with surrounding white space removed.
KeyValuePair<string, string> GetKeyValuePair(Match match1, int match1EndIndex)
{
    // get the label
    var label = text.Substring(match1.Index, match1.Value.Length - 1);

    // The description starts right after the colon. No character is skipped here: Trim below
    // already removes a leading blank, and skipping one would drop the first character of a
    // value written without a space, or run past the end when the text ends with the colon.
    var descStart = match1.Index + match1.Value.Length;
    var desc = text
        .Substring(descStart, match1EndIndex - descStart)
        .Trim(' ', '\r', '\n', '\t');
    return new KeyValuePair<string, string>(label, desc);
}

// Each entry ends where the next label starts; the last one ends at the end of the text.
// When no label is found the loop does not run and kvs stays empty.
for (int i = 0; i < matches.Length; i++)
{
    var endIndex = i + 1 < matches.Length ? matches[i + 1].Index : text.Length;
    kvs[i] = GetKeyValuePair(matches[i], endIndex);
}

foreach (var kv in kvs)
{
    Console.WriteLine($"{kv.Key}: {kv.Value}");
}