我有一个包含非结构化文本的数据框。在这个可重现的例子中,我直接从SEC网站下载一家公司的10-K文件,并用read.table加载。
# Use the current working directory as the download location
# (setwd(dir) just re-sets the directory that getwd() returned).
dir = getwd(); setwd(dir)
# Fetch the raw filing text from SEC EDGAR and store it as filing.txt in that directory.
download.file("https://www.sec.gov/Archives/edgar/data/2648/0000002648-96-000013.txt", file.path(dir,"filing.txt"))
# Read the file with tab as the field separator and with quote and comment
# handling switched off; the text is later accessed as column V1.
filing <- read.table(file=file.path(dir, "filing.txt"), sep="\t", quote="", comment.char="")
# NOTE(review): the result of this call is not assigned, so `filing` itself is left unchanged.
droplevels.data.frame(filing)
我想删除SEC标题,以便专注于文档的主体(从第216行开始)并将我的文本划分为部分/项目。
> filing$V1[216:218]
[1] PART I
[2] Item 1. Business.
[3] A. Organization of Business
因此,我正在尝试匹配以单词Item(或ITEM)开头的字符串,后跟一个或多个空格,一个或两个数字,一个点,一个或多个空格和一个或多个单词。例如:
Item 1. Business.
ITEM 1. BUSINESS
Item 1. Business
Item 10. Directors and Executive Officers of
ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF THE REGISTRANT
我的尝试是用str_detect和regex创建一个计数变量count,每当某一行匹配该模式时,计数就加一。
library(dplyr)
library(stringr)
# Add `count`: a running total of the rows whose V1 starts with "Item", exactly
# one space, one or two digits and a dot (case-insensitive match).
tidy_filing <- filing %>% mutate(count = cumsum(str_detect(V1, regex("^Item [\\d]{1,2}\\.",ignore_case = TRUE)))) %>% ungroup()
但是,我错过了前9项,我的计数仅从第10项开始。
tidy_filing[c(217, 218,251:254),]
V1 count
217 Item 1. Business. 0
218 A. Organization of Business 3 0
251 PART III 0
252 Item 10. Directors etc. 38 1
253 Item 11. Executive Compens. 38 2
254 Item 12. Security Ownership. 38 3
任何帮助都将受到高度赞赏。
答案 0 :(得分:1)
问题在于,个位数的项目编号前有两个空格(为了与两位数的编号对齐),而原正则表达式在 Item 与数字之间只允许一个空格。您可以通过将正则表达式字符串更改为
regex("^Item\\s+\\d{1,2}\\.", ignore_case = TRUE)
来解决此问题
import random
import tkinter
from tkinter import *
from tkinter import messagebox
from PIL import ImageTk, Image
# Application modes; Window.mode holds one of these and the main button's
# label and action depend on it.
# NOTE: this NONE rebinds the name NONE that `from tkinter import *` brings in.
NONE = 0
TAGGING = 1
MAKECANVAS = 4

# Captions of the tag check boxes created by Window.init_window().
TAGS = [
    "some text",
    "some more text",
]

# Image files from which Window.show_pic() picks one at random.
PICS = [
    "c:/users/rob/desktop/camera.jpg",
    "c:/users/rob/desktop/fridge.jpg",
]
class Window(Frame):
    """Main application frame: a photo-tagging view plus a "make canvas" form.

    The frame is always in one of three modes (``NONE``, ``TAGGING`` or
    ``MAKECANVAS``). The single button ``btn1`` changes its label and its
    action according to the current mode.
    """

    def __init__(self, master):
        """Build the widgets inside *master* and immediately start tagging."""
        Frame.__init__(self, master)
        self.master = master
        self.mode = NONE
        self.init_window()
        self.start_tagging()

    def _clear_photo(self):
        """Destroy the photo label currently on screen, if there is one."""
        if self.photo is not None:
            self.photo.destroy()
            self.photo = None

    def _clear_form(self):
        """Destroy every widget of the "make canvas" form and forget them."""
        for widget in self.form:
            widget.destroy()
        self.form = []

    def start_tagging(self):
        """Switch to TAGGING mode and show a first random picture."""
        self._clear_photo()
        # The form may still be on screen when tagging is started from the
        # menu while in MAKECANVAS mode; remove it so it does not linger.
        self._clear_form()
        # NOTE(review): the only argument is the dialog *title*; no message
        # text is supplied -- confirm that this is intended.
        messagebox.showinfo("Start tagging")
        self.mode = TAGGING
        self.configure_buttons()
        self.show_pic()

    def init_window(self):
        """Create the menu bar, the tag check boxes and the main button."""
        menubar = Menu(self.master)
        menu = Menu(menubar, tearoff=0)
        menu.add_command(label="Start tagging", command=self.start_tagging)
        menu.add_command(label="Make canvas", command=self.start_make_canvas)
        menubar.add_cascade(label="Tag", menu=menu)
        self.master.config(menu=menubar)
        self.pack(fill=BOTH, expand=1)  # take full space of root window

        self.photo = None
        # Always defined, so the form can be cleared before it is ever built.
        self.form = []

        # One IntVar-backed check box per tag, stacked vertically.
        self.tag_trk = {}
        for row, tag in enumerate(TAGS, start=1):
            self.tag_trk[tag] = IntVar()
            Checkbutton(self, text=tag, variable=self.tag_trk[tag]).place(
                x=500, y=10 + 20 * row
            )

        self.tag_count = StringVar()
        self.button1_label = StringVar()
        self.btn1 = Button(
            self, textvariable=self.button1_label, command=self.button1_click
        )
        self.btn1.place(x=10, y=495)
        self.max_score = StringVar()

    def configure_buttons(self):
        """Set the main button's label to match the current mode."""
        labels = {NONE: "Tag", TAGGING: "Next", MAKECANVAS: "Make"}
        label = labels.get(self.mode)
        if label is not None:  # an unknown mode leaves the label untouched
            self.button1_label.set(label)

    def button1_click(self):
        """Handle a click on the main button according to the current mode."""
        if self.mode == TAGGING:
            self.show_pic()
        elif self.mode == MAKECANVAS:
            # do some things here
            self._clear_form()
            self.mode = NONE
            self.configure_buttons()
        elif self.mode == NONE:
            self.start_tagging()

    def show_pic(self):
        """Replace the displayed photo with a randomly chosen one from PICS."""
        self._clear_photo()
        # Close the image file as soon as its pixels are copied into Tk.
        with Image.open(random.choice(PICS)) as picture:
            img = ImageTk.PhotoImage(picture)
        self.photo = tkinter.Label(self, image=img, borderwidth=0)
        # Keep a reference on the widget; the label alone does not keep the
        # PhotoImage object alive.
        self.photo.image = img
        self.photo.place(x=15, y=5)

    def start_make_canvas(self):
        """Switch to MAKECANVAS mode and show the "Max score" entry form."""
        self._clear_photo()
        # Choosing the menu entry twice must not stack a second form on top
        # of the first one.
        self._clear_form()
        self.mode = MAKECANVAS

        caption = Label(self, text='Max score')
        caption.place(x=80, y=200)
        entry = Entry(self, textvariable=self.max_score, width=20)
        entry.place(x=180, y=200)
        self.form = [caption, entry]

        entry.focus_set()
        self.configure_buttons()
def target_tags():
    """Create the 700x570 root window, attach a Window frame and run the Tk loop.

    The Tk instance is stored in the module-level ``root`` so that
    ``on_closing`` can destroy it.
    """
    global root
    root = tkinter.Tk()
    # Route the window manager's close request through on_closing().
    root.protocol("WM_DELETE_WINDOW", on_closing)
    root.geometry("700x570")
    main_frame = Window(root)
    root.mainloop()
def on_closing():
    """Handle the window-manager close request by destroying the root window."""
    # ``root`` is the module-level Tk instance assigned in target_tags().
    root.destroy()
# Launch the GUI only when this file is executed as a script.
if __name__ == "__main__":
    target_tags()