我正在尝试使用SparkR / sparklyr解析NASA-HTTP日志。我无法使其正常工作。
NASA-HTTP日志如下
ix-stp-fl2-19.ix.netcom.com - - [03/Aug/1995:23:03:09 -0400] "GET /images/faq.gif HTTP/1.0" 200 263
slip183-1.kw.jp.ibm.net - - [04/Aug/1995:18:42:17 -0400] "GET /shuttle/missions/sts-70/images/DSC-95EC-0001.gif HTTP/1.0" 200 107133
piweba4y.prodigy.com - - [05/Aug/1995:19:17:41 -0400] "GET /icons/menu.xbm HTTP/1.0" 200 527
我可以使用regexp_extract使用Pyspark做到这一点,如下所示:
# PySpark: build one column per log field using regexp_extract(col, pattern, group),
# which returns the text matched by capture group `group` (1-based).
split_df = base_df.select(regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
# Timestamp is the bracketed dd/Mon/yyyy:HH:MM:SS -zzzz section of the log line.
regexp_extract('value', r'^.*\[(\d\d\/\w{3}\/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]', 1).alias('timestamp'),
....
我正在尝试使用SparkR / sparklyr进行此操作,但是没有取得任何进展
# Initiate a SparkR session (Spark >= 2.0).
# sparkR.init() and sparkRSQL.init() are deprecated since Spark 2.0:
# sparkR.session() replaces both, and read.text() no longer takes a
# SQLContext as its first argument. Mixing the old and new entry points
# is what made the original snippet fail.
sparkR.session()
df <- read.text("/FileStore/tables/NASA_access_log*.gz")
尝试了几件事,但没有用
# Working SparkR attempt: extract the host field from each raw log line.
# sparkR.session() alone is the Spark >= 2.0 entry point; the deprecated
# sparkR.init()/sparkRSQL.init() pair is not needed.
sparkR.session()
df <- read.text("/FileStore/tables/NASA_access_log*.gz")
dim(df)
# SparkR::regexp_extract takes (column, pattern, group-index). The original
# call passed only a pattern string and no group, and select() expects
# column expressions — not the data frame repeated as an argument.
# SparkR also does not export %>%, so a plain nested call is used here.
host_df <- select(df, alias(regexp_extract(df$value, "^(\\S+)", 1), "host"))
head(host_df)
请让我知道如何在数据框上使用正则表达式。
谢谢 Ganesh
答案 0(得分:0)
在互联网上闲逛一天半后,我可以解析SparkR和sparklyR中的日志
# Initiate a SparkR session. On Spark >= 2.0 sparkR.session() is the sole
# entry point: the original's extra sparkR.session()/sparkRSQL.init() calls
# were redundant (sparkRSQL.init() is deprecated and read.text() no longer
# needs a SQLContext).
sparkR.session()
df <- read.text("/FileStore/tables/NASA_access_log*.gz")

# Build one column per log field with SparkR::regexp_extract(column,
# pattern, group-index). The original chained these with %>%, but SparkR
# does not export a pipe operator (that relied on magrittr/dplyr being
# attached elsewhere); sequential withColumn() calls avoid the dependency.
# Also uses <- instead of = for assignment, per R convention.
a <- withColumn(df, 'regex1', regexp_extract(df$value, '^(\\S+)', 1))                               # host
a <- withColumn(a, 'regex2', regexp_extract(df$value, "((\\S+ -\\d{4}))", 2))                       # timestamp
a <- withColumn(a, 'regex3', regexp_extract(df$value, '(\\"\\w+\\s+([^\\s]+)\\s+HTTP.*")', 2))      # request path
a <- withColumn(a, 'regex4', regexp_extract(df$value, '(^.*"\\s+([^\\s]+))', 2))                    # HTTP status
a <- withColumn(a, 'regex5', regexp_extract(df$value, '(^.*\\s+(\\d+)$)', 2))                       # bytes sent
head(SparkR::collect(a))
regex1 regex2
1 199.72.81.55 [01/Jul/1995:00:00:01 -0400
2 unicomp6.unicomp.net [01/Jul/1995:00:00:06 -0400
3 199.120.110.21 [01/Jul/1995:00:00:09 -0400
4 burger.letters.com [01/Jul/1995:00:00:11 -0400
5 199.120.110.21 [01/Jul/1995:00:00:11 -0400
6 burger.letters.com [01/Jul/1995:00:00:12 -0400
regex3 regex4 regex5
1 /history/apollo/ 200 6245
2 /shuttle/countdown/ 200 3985
3 /shuttle/missions/sts-73/mission-sts-73.html 200 4085
4 /shuttle/countdown/liftoff.html 304 0
5 /shuttle/missions/sts-73/sts-73-patch-small.gif 200 4179
6 /images/NASA-logosmall.gif 304 0
# sparklyr route: dplyr verbs on a Spark table are translated to Spark SQL
# and executed remotely; regexp_extract(line, pattern, idx) here is the
# Spark SQL function, returning capture group `idx` (1-based).
library(sparklyr)
library(dplyr)
library(stringr)
#sc <- spark_connect(master = "local", version = "2.1.0")
sc <- spark_connect(method = "databricks")
sdf <-spark_read_text(sc, name="df", path = "/FileStore/tables/NASA_access_log*.gz")
# NOTE(review): the quadruple backslashes appear to be needed because the
# R string yields \\S and the SQL translation layer consumes one more
# escaping level, leaving \S for Spark's regex engine — confirm against
# the sparklyr SQL-translation docs before changing.
sdf <- sdf %>% mutate(regex = regexp_extract(line, '^(\\\\S+)',1)) %>%
mutate(regex1 = regexp_extract(line, '((\\\\S+ -\\\\d{4}))',2)) %>%
mutate(regex2 = regexp_extract(line, '(\\\\"\\\\w+\\\\s+([^\\\\s]+)\\\\s+HTTP.*")',2)) %>%
mutate(regex3 = regexp_extract(line, '(^.*"\\\\s+([^\\\\s]+))',2)) %>%
mutate(regex4 = regexp_extract(line, '(^.*\\\\s+(\\\\d+)$)',2))
sdf
line regex regex1 regex2 regex3 regex4
1 "199.72.81.55 - - [01/J… 199.72.8… [01/Jul/19… /history/apollo/ 200 6245
2 "unicomp6.unicomp.net -… unicomp6… [01/Jul/19… /shuttle/countd… 200 3985
3 "199.120.110.21 - - [01… 199.120.… [01/Jul/19… /shuttle/missio… 200 4085
4 "burger.letters.com - -… burger.l… [01/Jul/19… /shuttle/countd… 304 0
5 "199.120.110.21 - - [01… 199.120.… [01/Jul/19… /shuttle/missio… 200 4179
6 "burger.letters.com - -… burger.l… [01/Jul/19… /images/NASA-lo… 304 0
7 "burger.letters.com - -… burger.l… [01/Jul/19… /shuttle/countd… 200 0