1/************************************************* 2* PCRE DEMONSTRATION PROGRAM * 3*************************************************/ 4 5/* This is a demonstration program to illustrate the most straightforward ways 6of calling the PCRE regular expression library from a C program. See the 7pcresample documentation for a short discussion ("man pcresample" if you have 8the PCRE man pages installed). 9 10In Unix-like environments, if PCRE is installed in your standard system 11libraries, you should be able to compile this program using this command: 12 13gcc -Wall pcredemo.c -lpcre -o pcredemo 14 15If PCRE is not installed in a standard place, it is likely to be installed with 16support for the pkg-config mechanism. If you have pkg-config, you can compile 17this program using this command: 18 19gcc -Wall pcredemo.c `pkg-config --cflags --libs libpcre` -o pcredemo 20 21If you do not have pkg-config, you may have to use this: 22 23gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \ 24 -R/usr/local/lib -lpcre -o pcredemo 25 26Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and 27library files for PCRE are installed on your system. Only some operating 28systems (e.g. Solaris) use the -R option. 29 30Building under Windows: 31 32If you want to statically link this program against a non-dll .a file, you must 33define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and 34pcre_free() exported functions will be declared __declspec(dllimport), with 35unwanted results. So in this environment, uncomment the following line. */ 36 37/* #define PCRE_STATIC */ 38 39#include <stdio.h> 40#include <string.h> 41#include <pcre.h> 42 43#define OVECCOUNT 30 /* should be a multiple of 3 */ 44 45 46int main(int argc, char **argv) 47{ 48pcre *re; 49const char *error; 50char *pattern; 51char *subject; 52unsigned char *name_table; 53unsigned int option_bits; 54int erroffset; 55int find_all; 56int crlf_is_newline; 57int namecount; 58int name_entry_size; 59int ovector[OVECCOUNT]; 60int subject_length; 61int rc, i; 62int utf8; 63 64 65/************************************************************************** 66* First, sort out the command line. There is only one possible option at * 67* the moment, "-g" to request repeated matching to find all occurrences, * 68* like Perl's /g option. We set the variable find_all to a non-zero value * 69* if the -g option is present. Apart from that, there must be exactly two * 70* arguments. * 71**************************************************************************/ 72 73find_all = 0; 74for (i = 1; i < argc; i++) 75 { 76 if (strcmp(argv[i], "-g") == 0) find_all = 1; 77 else break; 78 } 79 80/* After the options, we require exactly two arguments, which are the pattern, 81and the subject string. */ 82 83if (argc - i != 2) 84 { 85 printf("Two arguments required: a regex and a subject string\n"); 86 return 1; 87 } 88 89pattern = argv[i]; 90subject = argv[i+1]; 91subject_length = (int)strlen(subject); 92 93 94/************************************************************************* 95* Now we are going to compile the regular expression pattern, and handle * 96* and errors that are detected. * 97*************************************************************************/ 98 99re = pcre_compile( 100 pattern, /* the pattern */ 101 0, /* default options */ 102 &error, /* for error message */ 103 &erroffset, /* for error offset */ 104 NULL); /* use default character tables */ 105 106/* Compilation failed: print the error message and exit */ 107 108if (re == NULL) 109 { 110 printf("PCRE compilation failed at offset %d: %s\n", erroffset, error); 111 return 1; 112 } 113 114 115/************************************************************************* 116* If the compilation succeeded, we call PCRE again, in order to do a * 117* pattern match against the subject string. This does just ONE match. If * 118* further matching is needed, it will be done below. * 119*************************************************************************/ 120 121rc = pcre_exec( 122 re, /* the compiled pattern */ 123 NULL, /* no extra data - we didn't study the pattern */ 124 subject, /* the subject string */ 125 subject_length, /* the length of the subject */ 126 0, /* start at offset 0 in the subject */ 127 0, /* default options */ 128 ovector, /* output vector for substring information */ 129 OVECCOUNT); /* number of elements in the output vector */ 130 131/* Matching failed: handle error cases */ 132 133if (rc < 0) 134 { 135 switch(rc) 136 { 137 case PCRE_ERROR_NOMATCH: printf("No match\n"); break; 138 /* 139 Handle other special cases if you like 140 */ 141 default: printf("Matching error %d\n", rc); break; 142 } 143 pcre_free(re); /* Release memory used for the compiled pattern */ 144 return 1; 145 } 146 147/* Match succeded */ 148 149printf("\nMatch succeeded at offset %d\n", ovector[0]); 150 151 152/************************************************************************* 153* We have found the first match within the subject string. If the output * 154* vector wasn't big enough, say so. Then output any substrings that were * 155* captured. * 156*************************************************************************/ 157 158/* The output vector wasn't big enough */ 159 160if (rc == 0) 161 { 162 rc = OVECCOUNT/3; 163 printf("ovector only has room for %d captured substrings\n", rc - 1); 164 } 165 166/* Show substrings stored in the output vector by number. Obviously, in a real 167application you might want to do things other than print them. */ 168 169for (i = 0; i < rc; i++) 170 { 171 char *substring_start = subject + ovector[2*i]; 172 int substring_length = ovector[2*i+1] - ovector[2*i]; 173 printf("%2d: %.*s\n", i, substring_length, substring_start); 174 } 175 176 177/************************************************************************** 178* That concludes the basic part of this demonstration program. We have * 179* compiled a pattern, and performed a single match. The code that follows * 180* shows first how to access named substrings, and then how to code for * 181* repeated matches on the same subject. * 182**************************************************************************/ 183 184/* See if there are any named substrings, and if so, show them by name. First 185we have to extract the count of named parentheses from the pattern. */ 186 187(void)pcre_fullinfo( 188 re, /* the compiled pattern */ 189 NULL, /* no extra data - we didn't study the pattern */ 190 PCRE_INFO_NAMECOUNT, /* number of named substrings */ 191 &namecount); /* where to put the answer */ 192 193if (namecount <= 0) printf("No named substrings\n"); else 194 { 195 unsigned char *tabptr; 196 printf("Named substrings\n"); 197 198 /* Before we can access the substrings, we must extract the table for 199 translating names to numbers, and the size of each entry in the table. */ 200 201 (void)pcre_fullinfo( 202 re, /* the compiled pattern */ 203 NULL, /* no extra data - we didn't study the pattern */ 204 PCRE_INFO_NAMETABLE, /* address of the table */ 205 &name_table); /* where to put the answer */ 206 207 (void)pcre_fullinfo( 208 re, /* the compiled pattern */ 209 NULL, /* no extra data - we didn't study the pattern */ 210 PCRE_INFO_NAMEENTRYSIZE, /* size of each entry in the table */ 211 &name_entry_size); /* where to put the answer */ 212 213 /* Now we can scan the table and, for each entry, print the number, the name, 214 and the substring itself. */ 215 216 tabptr = name_table; 217 for (i = 0; i < namecount; i++) 218 { 219 int n = (tabptr[0] << 8) | tabptr[1]; 220 printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, 221 ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]); 222 tabptr += name_entry_size; 223 } 224 } 225 226 227/************************************************************************* 228* If the "-g" option was given on the command line, we want to continue * 229* to search for additional matches in the subject string, in a similar * 230* way to the /g option in Perl. This turns out to be trickier than you * 231* might think because of the possibility of matching an empty string. * 232* What happens is as follows: * 233* * 234* If the previous match was NOT for an empty string, we can just start * 235* the next match at the end of the previous one. * 236* * 237* If the previous match WAS for an empty string, we can't do that, as it * 238* would lead to an infinite loop. Instead, a special call of pcre_exec() * 239* is made with the PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED flags set. * 240* The first of these tells PCRE that an empty string at the start of the * 241* subject is not a valid match; other possibilities must be tried. The * 242* second flag restricts PCRE to one match attempt at the initial string * 243* position. If this match succeeds, an alternative to the empty string * 244* match has been found, and we can print it and proceed round the loop, * 245* advancing by the length of whatever was found. If this match does not * 246* succeed, we still stay in the loop, advancing by just one character. * 247* In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be * 248* more than one byte. * 249* * 250* However, there is a complication concerned with newlines. When the * 251* newline convention is such that CRLF is a valid newline, we must * 252* advance by two characters rather than one. The newline convention can * 253* be set in the regex by (*CR), etc.; if not, we must find the default. * 254*************************************************************************/ 255 256if (!find_all) /* Check for -g */ 257 { 258 pcre_free(re); /* Release the memory used for the compiled pattern */ 259 return 0; /* Finish unless -g was given */ 260 } 261 262/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline 263sequence. First, find the options with which the regex was compiled; extract 264the UTF-8 state, and mask off all but the newline options. */ 265 266(void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits); 267utf8 = option_bits & PCRE_UTF8; 268option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF| 269 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF; 270 271/* If no newline options were set, find the default newline convention from the 272build configuration. */ 273 274if (option_bits == 0) 275 { 276 int d; 277 (void)pcre_config(PCRE_CONFIG_NEWLINE, &d); 278 /* Note that these values are always the ASCII ones, even in 279 EBCDIC environments. CR = 13, NL = 10. */ 280 option_bits = (d == 13)? PCRE_NEWLINE_CR : 281 (d == 10)? PCRE_NEWLINE_LF : 282 (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF : 283 (d == -2)? PCRE_NEWLINE_ANYCRLF : 284 (d == -1)? PCRE_NEWLINE_ANY : 0; 285 } 286 287/* See if CRLF is a valid newline sequence. */ 288 289crlf_is_newline = 290 option_bits == PCRE_NEWLINE_ANY || 291 option_bits == PCRE_NEWLINE_CRLF || 292 option_bits == PCRE_NEWLINE_ANYCRLF; 293 294/* Loop for second and subsequent matches */ 295 296for (;;) 297 { 298 int options = 0; /* Normally no options */ 299 int start_offset = ovector[1]; /* Start at end of previous match */ 300 301 /* If the previous match was for an empty string, we are finished if we are 302 at the end of the subject. Otherwise, arrange to run another match at the 303 same point to see if a non-empty match can be found. */ 304 305 if (ovector[0] == ovector[1]) 306 { 307 if (ovector[0] == subject_length) break; 308 options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED; 309 } 310 311 /* Run the next matching operation */ 312 313 rc = pcre_exec( 314 re, /* the compiled pattern */ 315 NULL, /* no extra data - we didn't study the pattern */ 316 subject, /* the subject string */ 317 subject_length, /* the length of the subject */ 318 start_offset, /* starting offset in the subject */ 319 options, /* options */ 320 ovector, /* output vector for substring information */ 321 OVECCOUNT); /* number of elements in the output vector */ 322 323 /* This time, a result of NOMATCH isn't an error. If the value in "options" 324 is zero, it just means we have found all possible matches, so the loop ends. 325 Otherwise, it means we have failed to find a non-empty-string match at a 326 point where there was a previous empty-string match. In this case, we do what 327 Perl does: advance the matching position by one character, and continue. We 328 do this by setting the "end of previous match" offset, because that is picked 329 up at the top of the loop as the point at which to start again. 330 331 There are two complications: (a) When CRLF is a valid newline sequence, and 332 the current position is just before it, advance by an extra byte. (b) 333 Otherwise we must ensure that we skip an entire UTF-8 character if we are in 334 UTF-8 mode. */ 335 336 if (rc == PCRE_ERROR_NOMATCH) 337 { 338 if (options == 0) break; /* All matches found */ 339 ovector[1] = start_offset + 1; /* Advance one byte */ 340 if (crlf_is_newline && /* If CRLF is newline & */ 341 start_offset < subject_length - 1 && /* we are at CRLF, */ 342 subject[start_offset] == '\r' && 343 subject[start_offset + 1] == '\n') 344 ovector[1] += 1; /* Advance by one more. */ 345 else if (utf8) /* Otherwise, ensure we */ 346 { /* advance a whole UTF-8 */ 347 while (ovector[1] < subject_length) /* character. */ 348 { 349 if ((subject[ovector[1]] & 0xc0) != 0x80) break; 350 ovector[1] += 1; 351 } 352 } 353 continue; /* Go round the loop again */ 354 } 355 356 /* Other matching errors are not recoverable. */ 357 358 if (rc < 0) 359 { 360 printf("Matching error %d\n", rc); 361 pcre_free(re); /* Release memory used for the compiled pattern */ 362 return 1; 363 } 364 365 /* Match succeded */ 366 367 printf("\nMatch succeeded again at offset %d\n", ovector[0]); 368 369 /* The match succeeded, but the output vector wasn't big enough. */ 370 371 if (rc == 0) 372 { 373 rc = OVECCOUNT/3; 374 printf("ovector only has room for %d captured substrings\n", rc - 1); 375 } 376 377 /* As before, show substrings stored in the output vector by number, and then 378 also any named substrings. */ 379 380 for (i = 0; i < rc; i++) 381 { 382 char *substring_start = subject + ovector[2*i]; 383 int substring_length = ovector[2*i+1] - ovector[2*i]; 384 printf("%2d: %.*s\n", i, substring_length, substring_start); 385 } 386 387 if (namecount <= 0) printf("No named substrings\n"); else 388 { 389 unsigned char *tabptr = name_table; 390 printf("Named substrings\n"); 391 for (i = 0; i < namecount; i++) 392 { 393 int n = (tabptr[0] << 8) | tabptr[1]; 394 printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, 395 ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]); 396 tabptr += name_entry_size; 397 } 398 } 399 } /* End of loop to find second and subsequent matches */ 400 401printf("\n"); 402pcre_free(re); /* Release memory used for the compiled pattern */ 403return 0; 404} 405 406/* End of pcredemo.c */ 407