删除特殊引号和其他字符

时间:2017-06-28 13:50:31

标签: python nltk python-newspaper

我正在尝试使用jun 27, 2017 10:12:13 PM org.apache.catalina.startup.ContextConfig processAnnotationsJar SEVERE: Unable to process Jar entry [module-info.class] from Jar [jar:file:/D:/projectx/projectxFull/Apache%20Software%20Foundation/Tomcat%207.0/webapps/standard/WEB-INF/lib/javax.json-1.1.jar!/] for annotations org.apache.tomcat.util.bcel.classfile.ClassFormatException: Invalid byte tag in constant pool: 19 at org.apache.tomcat.util.bcel.classfile.Constant.readConstant(Constant.java:133) at org.apache.tomcat.util.bcel.classfile.ConstantPool.<init>(ConstantPool.java:60) at org.apache.tomcat.util.bcel.classfile.ClassParser.readConstantPool(ClassParser.java:209) at org.apache.tomcat.util.bcel.classfile.ClassParser.parse(ClassParser.java:119) at org.apache.catalina.startup.ContextConfig.processAnnotationsStream(ContextConfig.java:2134) at org.apache.catalina.startup.ContextConfig.processAnnotationsJar(ContextConfig.java:2010) at org.apache.catalina.startup.ContextConfig.processAnnotationsUrl(ContextConfig.java:1976) at org.apache.catalina.startup.ContextConfig.processAnnotations(ContextConfig.java:1961) at org.apache.catalina.startup.ContextConfig.webConfig(ContextConfig.java:1319) at org.apache.catalina.startup.ContextConfig.configureStart(ContextConfig.java:878) at org.apache.catalina.startup.ContextConfig.lifecycleEvent(ContextConfig.java:376) at org.apache.catalina.util.LifecycleSupport.fireLifecycleEvent(LifecycleSupport.java:119) at org.apache.catalina.util.LifecycleBase.fireLifecycleEvent(LifecycleBase.java:90) at org.apache.catalina.core.StandardContext.startInternal(StandardContext.java:5322) at org.apache.catalina.util.LifecycleBase.start(LifecycleBase.java:150) at org.apache.catalina.core.ContainerBase.addChildInternal(ContainerBase.java:901) at org.apache.catalina.core.ContainerBase.addChild(ContainerBase.java:877) at org.apache.catalina.core.StandardHost.addChild(StandardHost.java:633) at org.apache.catalina.startup.HostConfig.deployWAR(HostConfig.java:983) at org.apache.catalina.startup.HostConfig$DeployWar.run(HostConfig.java:1660) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://example.com/custom_tags is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jstl/core_rt is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jstl/core is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jsp/jstl/core is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jstl/fmt_rt is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jstl/fmt is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jsp/jstl/fmt is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jsp/jstl/functions is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://jakarta.apache.org/taglibs/standard/permittedTaglibs is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://jakarta.apache.org/taglibs/standard/scriptfree is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jstl/sql_rt is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jstl/sql is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jsp/jstl/sql is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jstl/xml_rt is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jstl/xml is already defined jun 27, 2017 10:12:14 PM org.apache.catalina.startup.TaglibUriRule body INFO: TLD skipped. URI: http://java.sun.com/jsp/jstl/xml is already defined SLF4J: Class path contains multiple SLF4J bindings. SLF4J: Found binding in [jar:file:/D:/ProjectX/projectxFull/Apache%20Software%20Foundation/Tomcat%207.0/webapps/standard/WEB-INF/lib/com.springsource.slf4j.log4j-1.6.1.jar!/org/slf4j/impl/StaticLoggerBinder.class] SLF4J: Found binding in [jar:file:/D:/ProjectX/projectxFull/Apache%20Software%20Foundation/Tomcat%207.0/webapps/standard/WEB-INF/lib/logback-classic-1.2.3.jar!/org/slf4j/impl/StaticLoggerBinder.class] SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. jun 27, 2017 10:12:14 PM org.apache.catalina.core.ApplicationContext log INFO: No Spring WebApplicationInitializer types detected on classpath log4j:WARN Error during default initialization java.lang.NoClassDefFoundError: com/sun/mail/util/MailLogger at javax.mail.Session.initLogger(Session.java:230) at javax.mail.Session.<init>(Session.java:214) at javax.mail.Session.getInstance(Session.java:251) at org.apache.log4j.net.SMTPAppender.createSession(SMTPAppender.java:225) at org.apache.log4j.net.SMTPAppender.activateOptions(SMTPAppender.java:137) at org.apache.log4j.config.PropertySetter.activate(PropertySetter.java:307) at org.apache.log4j.config.PropertySetter.setProperties(PropertySetter.java:172) at org.apache.log4j.config.PropertySetter.setProperties(PropertySetter.java:104) at org.apache.log4j.PropertyConfigurator.parseAppender(PropertyConfigurator.java:809) at org.apache.log4j.PropertyConfigurator.parseCategory(PropertyConfigurator.java:735) at org.apache.log4j.PropertyConfigurator.configureRootCategory(PropertyConfigurator.java:615) at org.apache.log4j.PropertyConfigurator.doConfigure(PropertyConfigurator.java:502) at org.apache.log4j.PropertyConfigurator.doConfigure(PropertyConfigurator.java:547) at org.apache.log4j.helpers.OptionConverter.selectAndConfigure(OptionConverter.java:483) at org.apache.log4j.LogManager.<clinit>(LogManager.java:127) at org.slf4j.impl.Log4jLoggerFactory.getLogger(Log4jLoggerFactory.java:73) at org.slf4j.LoggerFactory.getLogger(LoggerFactory.java:242) at org.apache.commons.logging.impl.SLF4JLogFactory.getInstance(SLF4JLogFactory.java:156) at org.apache.commons.logging.impl.SLF4JLogFactory.getInstance(SLF4JLogFactory.java:132) at org.apache.commons.logging.LogFactory.getLog(LogFactory.java:272) at org.springframework.web.context.ContextLoader.initWebApplicationContext(ContextLoader.java:301) at org.springframework.web.context.ContextLoaderListener.contextInitialized(ContextLoaderListener.java:107) at org.apache.catalina.core.StandardContext.listenerStart(StandardContext.java:4939) at org.apache.catalina.core.StandardContext.startInternal(StandardContext.java:5434) at org.apache.catalina.util.LifecycleBase.start(LifecycleBase.java:150) at org.apache.catalina.core.ContainerBase.addChildInternal(ContainerBase.java:901) at org.apache.catalina.core.ContainerBase.addChild(ContainerBase.java:877) at org.apache.catalina.core.StandardHost.addChild(StandardHost.java:633) at org.apache.catalina.startup.HostConfig.deployWAR(HostConfig.java:983) at org.apache.catalina.startup.HostConfig$DeployWar.run(HostConfig.java:1660) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) Caused by: java.lang.ClassNotFoundException: com.sun.mail.util.MailLogger at org.apache.catalina.loader.WebappClassLoader.loadClass(WebappClassLoader.java:1702) at org.apache.catalina.loader.WebappClassLoader.loadClass(WebappClassLoader.java:1547) ... 35 more jun 27, 2017 10:12:15 PM org.apache.catalina.core.ApplicationContext log INFO: Initializing Spring root WebApplicationContext jun 27, 2017 10:12:15 PM org.apache.catalina.core.StandardContext listenerStart SEVERE: Exception sending context initialized event to listener instance of class org.springframework.web.context.ContextLoaderListener org.springframework.beans.factory.parsing.BeanDefinitionParsingException: Configuration problem: You cannot use a spring-security-2.0.xsd or spring-security-3.0.xsd or spring-security-3.1.xsd schema or spring-security-3.2.xsd schema or spring-security-4.0.xsd schema with Spring Security 4.2. Please update your schema declarations to the 4.2 schema. Offending resource: ServletContext resource [/WEB-INF/spring-security.xml] at org.springframework.beans.factory.parsing.FailFastProblemReporter.fatal(FailFastProblemReporter.java:60) at org.springframework.beans.factory.parsing.ReaderContext.fatal(ReaderContext.java:68) at org.springframework.beans.factory.parsing.ReaderContext.fatal(ReaderContext.java:55) at org.springframework.security.config.SecurityNamespaceHandler.parse(SecurityNamespaceHandler.java:88) at org.springframework.beans.factory.xml.BeanDefinitionParserDelegate.parseCustomElement(BeanDefinitionParserDelegate.java:1411) at org.springframework.beans.factory.xml.BeanDefinitionParserDelegate.parseCustomElement(BeanDefinitionParserDelegate.java:1401) at org.springframework.beans.factory.xml.DefaultBeanDefinitionDocumentReader.parseBeanDefinitions(DefaultBeanDefinitionDocumentReader.java:172) at org.springframework.beans.factory.xml.DefaultBeanDefinitionDocumentReader.doRegisterBeanDefinitions(DefaultBeanDefinitionDocumentReader.java:142) at org.springframework.beans.factory.xml.DefaultBeanDefinitionDocumentReader.registerBeanDefinitions(DefaultBeanDefinitionDocumentReader.java:94) at org.springframework.beans.factory.xml.XmlBeanDefinitionReader.registerBeanDefinitions(XmlBeanDefinitionReader.java:508) at org.springframework.beans.factory.xml.XmlBeanDefinitionReader.doLoadBeanDefinitions(XmlBeanDefinitionReader.java:392) at org.springframework.beans.factory.xml.XmlBeanDefinitionReader.loadBeanDefinitions(XmlBeanDefinitionReader.java:336) at org.springframework.beans.factory.xml.XmlBeanDefinitionReader.loadBeanDefinitions(XmlBeanDefinitionReader.java:304) at org.springframework.beans.factory.support.AbstractBeanDefinitionReader.loadBeanDefinitions(AbstractBeanDefinitionReader.java:181) at org.springframework.beans.factory.support.AbstractBeanDefinitionReader.loadBeanDefinitions(AbstractBeanDefinitionReader.java:217) at org.springframework.beans.factory.support.AbstractBeanDefinitionReader.loadBeanDefinitions(AbstractBeanDefinitionReader.java:188) at org.springframework.web.context.support.XmlWebApplicationContext.loadBeanDefinitions(XmlWebApplicationContext.java:125) at org.springframework.web.context.support.XmlWebApplicationContext.loadBeanDefinitions(XmlWebApplicationContext.java:94) at org.springframework.context.support.AbstractRefreshableApplicationContext.refreshBeanFactory(AbstractRefreshableApplicationContext.java:129) at org.springframework.context.support.AbstractApplicationContext.obtainFreshBeanFactory(AbstractApplicationContext.java:614) at org.springframework.context.support.AbstractApplicationContext.refresh(AbstractApplicationContext.java:515) at org.springframework.web.context.ContextLoader.configureAndRefreshWebApplicationContext(ContextLoader.java:443) at org.springframework.web.context.ContextLoader.initWebApplicationContext(ContextLoader.java:325) at org.springframework.web.context.ContextLoaderListener.contextInitialized(ContextLoaderListener.java:107) at org.apache.catalina.core.StandardContext.listenerStart(StandardContext.java:4939) at org.apache.catalina.core.StandardContext.startInternal(StandardContext.java:5434) at org.apache.catalina.util.LifecycleBase.start(LifecycleBase.java:150) at org.apache.catalina.core.ContainerBase.addChildInternal(ContainerBase.java:901) at org.apache.catalina.core.ContainerBase.addChild(ContainerBase.java:877) at org.apache.catalina.core.StandardHost.addChild(StandardHost.java:633) at org.apache.catalina.startup.HostConfig.deployWAR(HostConfig.java:983) at org.apache.catalina.startup.HostConfig$DeployWar.run(HostConfig.java:1660) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) jun 27, 2017 10:12:15 PM org.apache.catalina.core.StandardContext startInternal SEVERE: Error listenerStart jun 27, 2017 10:12:15 PM org.apache.catalina.core.StandardContext startInternal SEVERE: Context [/standard] startup failed due to previous errors jun 27, 2017 10:12:15 PM org.apache.catalina.core.ApplicationContext log INFO: Closing Spring root WebApplicationContext jun 27, 2017 10:12:15 PM org.apache.catalina.core.StandardContext listenerStop SEVERE: Exception sending context destroyed event to listener instance of class org.springframework.web.context.ContextLoaderListener java.lang.IllegalStateException: BeanFactory not initialized or already closed - call 'refresh' before accessing beans via the ApplicationContext at org.springframework.context.support.AbstractRefreshableApplicationContext.getBeanFactory(AbstractRefreshableApplicationContext.java:170) at org.springframework.context.support.AbstractApplicationContext.destroyBeans(AbstractApplicationContext.java:1030) at org.springframework.context.support.AbstractApplicationContext.doClose(AbstractApplicationContext.java:1006) at org.springframework.context.support.AbstractApplicationContext.close(AbstractApplicationContext.java:958) at org.springframework.web.context.ContextLoader.closeWebApplicationContext(ContextLoader.java:583) at org.springframework.web.context.ContextLoaderListener.contextDestroyed(ContextLoaderListener.java:116) at org.apache.catalina.core.StandardContext.listenerStop(StandardContext.java:4980) at org.apache.catalina.core.StandardContext.stopInternal(StandardContext.java:5626) at org.apache.catalina.util.LifecycleBase.stop(LifecycleBase.java:232) at org.apache.catalina.util.LifecycleBase.start(LifecycleBase.java:160) at org.apache.catalina.core.ContainerBase.addChildInternal(ContainerBase.java:901) at org.apache.catalina.core.ContainerBase.addChild(ContainerBase.java:877) at org.apache.catalina.core.StandardHost.addChild(StandardHost.java:633) at org.apache.catalina.startup.HostConfig.deployWAR(HostConfig.java:983) at org.apache.catalina.startup.HostConfig$DeployWar.run(HostConfig.java:1660) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) 中的Article下载文章,并尝试使用newspaper nltk对单词进行标记。问题是,当我尝试打印已解析的文章文本时,其中一些文章有word_tokenizer等特殊引号,这些引号未被标记符过滤掉,就像常规'

一样

有没有办法用普通引号替换这些特殊引号,或者更好地删除令牌化程序可能遗漏的所有可能的特殊字符?

我尝试通过在代码中明确提及它们来删除这些特殊字符,但它给出了错误"

1 个答案:

答案 0 :(得分:1)

使用unidecode包通常会用utf-8替换这些字符。

from unidecode import unidecode
text = unidecode(text)

然而,缺点是您还可以更改一些您可能想要保留的字符(例如强调的字符)。如果是这种情况,可以选择使用regular expressions来专门删除(或替换)某些预先识别的特殊字符:

import re
exotic_quotes = ['\\x92'] # fill this up
text = re.sub(exotic_quotes, "'", text) # changing the second argument to fill the kind of quote you want to replace the exotic ones with

我希望这有帮助!