我有一个如下所示的字符串
-- Extracts the text after the colon of the first "Genre:..." tag in the list.
-- regexp_extract yields a single match, so later Genre tags are not returned.
select
    split(
        regexp_extract(
            coalesce('ITheme:Sports,Genre:SportingEvent,Genre:Sports,Genre:Football,Genre:Pro,ITheme:Football'),
            '(Genre:.[^,]+)',
            0
        ),
        ':'
    )[1]
如果我使用上面的查询,它只会返回Genre的第一个匹配,即SportingEvent
Genre
我希望输出在单独的列中,如
SportingEvent,Sports, Football
def averageSentence(text):
    """Print the mean length of the sentences in one corpus file.

    NOTE(review): ``text`` is accepted but never used; the file id below is a
    hard-coded placeholder -- confirm whether ``text`` was meant to select it.
    ``inaugural`` is not defined in this snippet; presumably it is
    ``nltk.corpus.inaugural`` -- TODO confirm.
    """
    sents = inaugural.sents(fileids=['fileid_here.txt'])
    # Each element of ``sents`` is one sentence (presumably a list of tokens),
    # so the sum of the element lengths divided by the sentence count is the
    # average sentence length.
    avg = sum(len(sentence) for sentence in sents) / len(sents)
    print(avg)
答案 0 :(得分:0)
以下是上述问题的一个答案,但在大型数据集上运行会花费大量时间
-- Collects the distinct Genre values from a comma-separated list of Key:Value tags.
with tag_lists as (
    -- Break the raw string into an array holding one Key:Value tag per element.
    select
        split('ITheme:Sports,Genre:SportingEvent,Genre:Sports,Genre:Football,Genre:Pro,ITheme:Football', ',') as tags
),
genre_candidates as (
    -- One row per tag; capture whatever follows "Genre:" in that tag.
    select
        regexp_extract(tag, 'Genre:(.*)', 1) as genre
    from tag_lists
    lateral view explode(tag_lists.tags) exploded_tags as tag
)
-- Rows whose capture is empty are discarded before the values are gathered
-- into a set.
select
    collect_set(genre)
from genre_candidates
where genre != ''
答案 1 :(得分:0)
这也有效
-- Reduces a comma-separated list of Key:Value tags to its Genre values only.
-- Steps, innermost call first:
--   1. delete every Key:Value tag whose key does not end in "Genre"; the
--      negative lookbehind (?<!Genre) refuses a match whose colon directly
--      follows "Genre", so the Genre tags are the ones that survive,
--   2. remove the comma left dangling at the start and/or end of the string,
--   3. collapse runs of commas left behind by tags deleted mid-list,
--   4. strip the remaining "Genre:" prefixes.
-- Result for this input: SportingEvent,Sports,Football,Pro
select
    regexp_replace(
        regexp_replace(
            regexp_replace(
                regexp_replace(
                    'ITheme:Sports,Genre:SportingEvent,Genre:Sports,Genre:Football,Genre:Pro,ITheme:Football',
                    '((\\w*)(?<!Genre):.[^,]*)',
                    ''
                ),
                '(^,)|(,$)',
                ''
            ),
            ',{2,}',
            ','
        ),
        'Genre:',
        ''
    ) as Genre