正则表达式,用于读取<title>和</title>之间的字符串

时间:2010-03-30 14:36:49

标签: iphone

我希望阅读html字符串之间的内容。

我认为它应该在objective-c

@"<title([\\s\\S]*)</title>"
下面的

是为正则表达式重写的代码

//source of NSStringCategory.h
#import <Foundation/Foundation.h>
#import <regex.h>


/**
 * Objective-C wrapper around a POSIX regular expression compiled with
 * regcomp() from <regex.h>.
 *
 * Despite its name this is a regular NSObject subclass, not a category; the
 * NSString category built on top of it is declared separately.
 */
@interface NSStringCategory : NSObject
{
    // Compiled pattern; filled in by -initWithPattern:options: and
    // released with regfree() in -dealloc.
    regex_t preg;
}

// Class setup: creates the shared placeholder string returned by +null.
+ (void)initialize;

// Placeholder stored by -capturedSubstringsOfString: for a capture group
// that did not take part in the match.
+ (NSString *)null;

// Convenience constructors returning an autoreleased instance.
// The single-argument form passes 0 for `options`.
+ (NSStringCategory *)regexWithPattern:(NSString *)pattern;
+ (NSStringCategory *)regexWithPattern:(NSString *)pattern options:(int)options;

// Compiles `pattern` with regcomp(). REG_EXTENDED is always added to
// `options`, so only extra regcomp() flags need to be passed.
// Raises "CSRegexException" when the pattern cannot be compiled.
- (id)initWithPattern:(NSString *)pattern options:(int)options;

// Frees the compiled pattern.
- (void)dealloc;

// YES when the pattern matches somewhere in `string`.
- (BOOL)matchesString:(NSString *)string;

// The portion of `string` covered by the first match, or nil when there is
// no match.
- (NSString *)matchedSubstringOfString:(NSString *)string;

// Array with the whole match at index 0 followed by one entry per
// parenthesised subexpression, or nil when there is no match.
- (NSArray *)capturedSubstringsOfString:(NSString *)string;

@end


/**
 * Regular-expression conveniences on NSString. Every method compiles
 * `pattern` into a temporary NSStringCategory object and applies it to the
 * receiver; the variants without `options` pass 0.
 */
@interface NSString (NSStringCategory)

// YES when `pattern` matches somewhere in the receiver.
// REG_NOSUB is added to `options` because no match offsets are needed.
- (BOOL)matchedByPattern:(NSString *)pattern;
- (BOOL)matchedByPattern:(NSString *)pattern options:(int)options;

// The part of the receiver covered by the first match, or nil when there is
// no match.
- (NSString *)substringMatchedByPattern:(NSString *)pattern;
- (NSString *)substringMatchedByPattern:(NSString *)pattern options:(int)options;

// The whole match followed by the captured subexpressions, or nil when there
// is no match.
- (NSArray *)substringsCapturedByPattern:(NSString *)pattern;
- (NSArray *)substringsCapturedByPattern:(NSString *)pattern options:(int)options;

// Copy of the receiver in which each of the characters
// ^ . [ $ ( ) | * + ? { \ is preceded by a backslash.
- (NSString *)escapedPattern;

@end

和.m文件

 #import "NSStringCategory.h"
static NSString *nullstring=nil;

@implementation NSStringCategory

/**
 * Compiles `pattern` as a POSIX extended regular expression.
 *
 * REG_EXTENDED is always OR-ed into `options`, so callers only pass extra
 * regcomp() flags. Raises "CSRegexException" when the pattern has no UTF-8
 * C string (for example when it is nil) or when regcomp() rejects it.
 *
 * Note: the instance is not released before the exception is raised.
 */
-(id)initWithPattern:(NSString *)pattern options:(int)options
{
    if(self=[super init])
    {
        const char *cpattern=[pattern UTF8String];
        if(!cpattern)
        {
            // regcomp() must never be handed a NULL pattern.
            [NSException raise:@"CSRegexException"
                        format:@"Could not compile regex \"%@\": no pattern given",pattern];
        }

        int err=regcomp(&preg,cpattern,options|REG_EXTENDED);
        if(err)
        {
            char errbuf[256];
            regerror(err,&preg,errbuf,sizeof(errbuf));
            [NSException raise:@"CSRegexException"
                        format:@"Could not compile regex \"%@\": %s",pattern,errbuf];
        }
    }
    return self;
}

/** Frees the compiled pattern. */
-(void)dealloc
{
    regfree(&preg);
    [super dealloc];
}

/**
 * Returns YES when the pattern matches somewhere in `string`.
 * A string without a UTF-8 C string (e.g. nil) never matches.
 */
-(BOOL)matchesString:(NSString *)string
{
    const char *cstr=[string UTF8String];
    if(!cstr) return NO;

    return regexec(&preg,cstr,0,NULL,0)==0;
}

/**
 * Returns the part of `string` covered by the first match, or nil when
 * there is no match, when `string` is nil, or when regexec() reported no
 * offsets.
 */
-(NSString *)matchedSubstringOfString:(NSString *)string
{
    const char *cstr=[string UTF8String];
    if(!cstr) return nil;

    // Pre-set the offsets: regexec() ignores the match buffer for a pattern
    // compiled with REG_NOSUB, which would otherwise leave it uninitialised.
    regmatch_t match;
    match.rm_so=-1;
    match.rm_eo=-1;

    if(regexec(&preg,cstr,1,&match,0)!=0) return nil;
    if(match.rm_so<0||match.rm_eo<match.rm_so) return nil;

    // Offsets are byte offsets into the UTF-8 buffer, not character indexes.
    return [[[NSString alloc] initWithBytes:cstr+match.rm_so
                                     length:match.rm_eo-match.rm_so
                                   encoding:NSUTF8StringEncoding] autorelease];
}

/**
 * Returns an array holding the whole match at index 0 followed by one entry
 * per parenthesised subexpression, or nil when there is no match (or when
 * `string` is nil or the match buffer cannot be allocated).
 *
 * Groups that did not take part in the match are represented by +null.
 */
-(NSArray *)capturedSubstringsOfString:(NSString *)string
{
    const char *cstr=[string UTF8String];
    if(!cstr) return nil;

    size_t num=preg.re_nsub+1;
    regmatch_t *matches=calloc(num,sizeof(regmatch_t));
    if(!matches) return nil;

    NSArray *result=nil;

    if(regexec(&preg,cstr,num,matches,0)==0)
    {
        NSMutableArray *array=[NSMutableArray arrayWithCapacity:num];

        size_t i;
        for(i=0;i<num;i++)
        {
            NSString *str=nil;

            if(!(matches[i].rm_so==-1&&matches[i].rm_eo==-1))
            {
                str=[[[NSString alloc] initWithBytes:cstr+matches[i].rm_so
                                              length:matches[i].rm_eo-matches[i].rm_so
                                            encoding:NSUTF8StringEncoding] autorelease];
            }

            // Unmatched group, or a byte range that is not valid UTF-8:
            // use the placeholder, since adding nil to the array would raise.
            if(!str) str=nullstring;

            [array addObject:str];
        }

        result=[NSArray arrayWithArray:array];
    }

    free(matches);

    return result;
}

/** Returns an autoreleased regex compiled from `pattern` with `options`. */
+(NSStringCategory *)regexWithPattern:(NSString *)pattern options:(int)options
{
    return [[[NSStringCategory alloc] initWithPattern:pattern options:options] autorelease];
}

/** Same as +regexWithPattern:options: with no extra options. */
+(NSStringCategory *)regexWithPattern:(NSString *)pattern
{
    return [[[NSStringCategory alloc] initWithPattern:pattern options:0] autorelease];
}

/** Placeholder stored for capture groups that did not take part in a match. */
+(NSString *)null
{
    return nullstring;
}

/** Creates the shared placeholder string once. */
+(void)initialize
{
    if(!nullstring) nullstring=[[NSString alloc] initWithString:@""];
}

@end

@implementation NSString (NSStringCategory)

/**
 * Returns YES when `pattern` matches somewhere in the receiver.
 * REG_NOSUB is added to `options` because only a yes/no answer is needed.
 */
-(BOOL)matchedByPattern:(NSString *)pattern options:(int)options
{
    int flags=options|REG_NOSUB;
    return [[NSStringCategory regexWithPattern:pattern options:flags] matchesString:self];
}

/** Same as -matchedByPattern:options: with no extra options. */
-(BOOL)matchedByPattern:(NSString *)pattern
{
    return [self matchedByPattern:pattern options:0];
}

/**
 * Returns the part of the receiver covered by the first match of `pattern`,
 * or nil when there is no match.
 */
-(NSString *)substringMatchedByPattern:(NSString *)pattern options:(int)options
{
    return [[NSStringCategory regexWithPattern:pattern options:options] matchedSubstringOfString:self];
}

/** Same as -substringMatchedByPattern:options: with no extra options. */
-(NSString *)substringMatchedByPattern:(NSString *)pattern
{
    return [self substringMatchedByPattern:pattern options:0];
}

/**
 * Returns the whole match followed by the captured subexpressions of
 * `pattern`, or nil when there is no match.
 */
-(NSArray *)substringsCapturedByPattern:(NSString *)pattern options:(int)options
{
    return [[NSStringCategory regexWithPattern:pattern options:options] capturedSubstringsOfString:self];
}

/** Same as -substringsCapturedByPattern:options: with no extra options. */
-(NSArray *)substringsCapturedByPattern:(NSString *)pattern
{
    return [self substringsCapturedByPattern:pattern options:0];
}

/**
 * Returns a copy of the receiver in which each of the characters
 * ^ . [ $ ( ) | * + ? { \ is preceded by a backslash, so the result can be
 * embedded in a pattern as literal text.
 */
-(NSString *)escapedPattern
{
    NSCharacterSet *special=[NSCharacterSet characterSetWithCharactersInString:@"^.[$()|*+?{\\"];

    // -length returns NSUInteger; keep the full width for the length and index.
    NSUInteger len=[self length];
    NSMutableString *escaped=[NSMutableString stringWithCapacity:len];

    NSUInteger i;
    for(i=0;i<len;i++)
    {
        unichar c=[self characterAtIndex:i];
        if([special characterIsMember:c]) [escaped appendString:@"\\"];
        [escaped appendFormat:@"%C",c];
    }
    return [NSString stringWithString:escaped];
}

@end

我使用下面的代码获取“”和“”

之间的字符串
// NOTE(review): the backslashes in this literal are not doubled, and the class compiles patterns as POSIX ERE (REG_EXTENDED), which per the first answer below has no PCRE-style \s / \S shorthand - confirm the pattern before relying on it.
NSStringCategory *a=[[NSStringCategory alloc] initWithPattern:@"<title([\s\S]*)</title>" options:0];//

不幸的是[matchSubstringOfString:response]]总是返回nil

如果正则表达式错误或任何其他原因,我不会。

欢迎任何评论

由于

InterDev中

3 个答案:

答案 0 :(得分:3)

(初步警告:you can't parse HTML correctly with Regex。)


您正在使用regex.h,它提供POSIX正则表达式(在您的情况下为ERE)。它们不支持所有PCRE语法,例如\s\S(并且[\s\S]无论如何都是无用的 - 它匹配任何)。

可能你应该使用

initWithPattern:@"<title[^>]*>([^<]*)</title>" options:REG_ICASE

答案 1 :(得分:1)

<title[^>]*>\([^<]*\)</title>应该可以解决问题。

答案 2 :(得分:0)

对于这种特定情况,我可能会尝试从/ System / Library / Frameworks / WebKit框架中实例化WebDocumentRepresentation对象。

您可以将WebDocumentRepresentation对象的数据源设置为您感兴趣的HTML页面,然后使用对象的 title 方法返回标题。

这是对象上的 Mac OSX Reference Library document