kallsyms.c revision 41f11a4fa378201e902892130b11d78cf7cf8e10
1/* Generate assembler source containing symbol information 2 * 3 * Copyright 2002 by Kai Germaschewski 4 * 5 * This software may be used and distributed according to the terms 6 * of the GNU General Public License, incorporated herein by reference. 7 * 8 * Usage: nm -n vmlinux | scripts/kallsyms [--all-symbols] > symbols.S 9 * 10 * ChangeLog: 11 * 12 * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com> 13 * Changed the compression method from stem compression to "table lookup" 14 * compression 15 * 16 * Table compression uses all the unused char codes on the symbols and 17 * maps these to the most used substrings (tokens). For instance, it might 18 * map char code 0xF7 to represent "write_" and then in every symbol where 19 * "write_" appears it can be replaced by 0xF7, saving 5 bytes. 20 * The used codes themselves are also placed in the table so that the 21 * decompresion can work without "special cases". 22 * Applied to kernel symbols, this usually produces a compression ratio 23 * of about 50%. 24 * 25 */ 26 27#include <stdio.h> 28#include <stdlib.h> 29#include <string.h> 30#include <ctype.h> 31 32/* maximum token length used. It doesn't pay to increase it a lot, because 33 * very long substrings probably don't repeat themselves too often. */ 34#define MAX_TOK_SIZE 11 35#define KSYM_NAME_LEN 127 36 37/* we use only a subset of the complete symbol table to gather the token count, 38 * to speed up compression, at the expense of a little compression ratio */ 39#define WORKING_SET 1024 40 41/* first find the best token only on the list of tokens that would profit more 42 * than GOOD_BAD_THRESHOLD. Only if this list is empty go to the "bad" list. 43 * Increasing this value will put less tokens on the "good" list, so the search 44 * is faster. However, if the good list runs out of tokens, we must painfully 45 * search the bad list. */ 46#define GOOD_BAD_THRESHOLD 10 47 48/* token hash parameters */ 49#define HASH_BITS 18 50#define HASH_TABLE_SIZE (1 << HASH_BITS) 51#define HASH_MASK (HASH_TABLE_SIZE - 1) 52#define HASH_BASE_OFFSET 2166136261U 53#define HASH_FOLD(a) ((a)&(HASH_MASK)) 54 55/* flags to mark symbols */ 56#define SYM_FLAG_VALID 1 57#define SYM_FLAG_SAMPLED 2 58 59struct sym_entry { 60 unsigned long long addr; 61 char type; 62 unsigned char flags; 63 unsigned char len; 64 unsigned char *sym; 65}; 66 67 68static struct sym_entry *table; 69static int size, cnt; 70static unsigned long long _stext, _etext, _sinittext, _einittext; 71static int all_symbols = 0; 72static char symbol_prefix_char = '\0'; 73 74struct token { 75 unsigned char data[MAX_TOK_SIZE]; 76 unsigned char len; 77 /* profit: the number of bytes that could be saved by inserting this 78 * token into the table */ 79 int profit; 80 struct token *next; /* next token on the hash list */ 81 struct token *right; /* next token on the good/bad list */ 82 struct token *left; /* previous token on the good/bad list */ 83 struct token *smaller; /* token that is less one letter than this one */ 84 }; 85 86struct token bad_head, good_head; 87struct token *hash_table[HASH_TABLE_SIZE]; 88 89/* the table that holds the result of the compression */ 90unsigned char best_table[256][MAX_TOK_SIZE+1]; 91unsigned char best_table_len[256]; 92 93 94static void 95usage(void) 96{ 97 fprintf(stderr, "Usage: kallsyms [--all-symbols] [--symbol-prefix=<prefix char>] < in.map > out.S\n"); 98 exit(1); 99} 100 101/* 102 * This ignores the intensely annoying "mapping symbols" found 103 * in ARM ELF files: $a, $t and $d. 104 */ 105static inline int 106is_arm_mapping_symbol(const char *str) 107{ 108 return str[0] == '$' && strchr("atd", str[1]) 109 && (str[2] == '\0' || str[2] == '.'); 110} 111 112static int 113read_symbol(FILE *in, struct sym_entry *s) 114{ 115 char str[500]; 116 char *sym; 117 int rc; 118 119 rc = fscanf(in, "%llx %c %499s\n", &s->addr, &s->type, str); 120 if (rc != 3) { 121 if (rc != EOF) { 122 /* skip line */ 123 fgets(str, 500, in); 124 } 125 return -1; 126 } 127 128 sym = str; 129 /* skip prefix char */ 130 if (symbol_prefix_char && str[0] == symbol_prefix_char) 131 sym++; 132 133 /* Ignore most absolute/undefined (?) symbols. */ 134 if (strcmp(sym, "_stext") == 0) 135 _stext = s->addr; 136 else if (strcmp(sym, "_etext") == 0) 137 _etext = s->addr; 138 else if (strcmp(sym, "_sinittext") == 0) 139 _sinittext = s->addr; 140 else if (strcmp(sym, "_einittext") == 0) 141 _einittext = s->addr; 142 else if (toupper(s->type) == 'A') 143 { 144 /* Keep these useful absolute symbols */ 145 if (strcmp(sym, "__kernel_syscall_via_break") && 146 strcmp(sym, "__kernel_syscall_via_epc") && 147 strcmp(sym, "__kernel_sigtramp") && 148 strcmp(sym, "__gp")) 149 return -1; 150 151 } 152 else if (toupper(s->type) == 'U' || 153 is_arm_mapping_symbol(sym)) 154 return -1; 155 156 /* include the type field in the symbol name, so that it gets 157 * compressed together */ 158 s->len = strlen(str) + 1; 159 s->sym = (char *) malloc(s->len + 1); 160 strcpy(s->sym + 1, str); 161 s->sym[0] = s->type; 162 163 return 0; 164} 165 166static int 167symbol_valid(struct sym_entry *s) 168{ 169 /* Symbols which vary between passes. Passes 1 and 2 must have 170 * identical symbol lists. The kallsyms_* symbols below are only added 171 * after pass 1, they would be included in pass 2 when --all-symbols is 172 * specified so exclude them to get a stable symbol list. 173 */ 174 static char *special_symbols[] = { 175 "kallsyms_addresses", 176 "kallsyms_num_syms", 177 "kallsyms_names", 178 "kallsyms_markers", 179 "kallsyms_token_table", 180 "kallsyms_token_index", 181 182 /* Exclude linker generated symbols which vary between passes */ 183 "_SDA_BASE_", /* ppc */ 184 "_SDA2_BASE_", /* ppc */ 185 NULL }; 186 int i; 187 int offset = 1; 188 189 /* skip prefix char */ 190 if (symbol_prefix_char && *(s->sym + 1) == symbol_prefix_char) 191 offset++; 192 193 /* if --all-symbols is not specified, then symbols outside the text 194 * and inittext sections are discarded */ 195 if (!all_symbols) { 196 if ((s->addr < _stext || s->addr > _etext) 197 && (s->addr < _sinittext || s->addr > _einittext)) 198 return 0; 199 /* Corner case. Discard any symbols with the same value as 200 * _etext or _einittext, they can move between pass 1 and 2 201 * when the kallsyms data is added. If these symbols move then 202 * they may get dropped in pass 2, which breaks the kallsyms 203 * rules. 204 */ 205 if ((s->addr == _etext && strcmp(s->sym + offset, "_etext")) || 206 (s->addr == _einittext && strcmp(s->sym + offset, "_einittext"))) 207 return 0; 208 } 209 210 /* Exclude symbols which vary between passes. */ 211 if (strstr(s->sym + offset, "_compiled.")) 212 return 0; 213 214 for (i = 0; special_symbols[i]; i++) 215 if( strcmp(s->sym + offset, special_symbols[i]) == 0 ) 216 return 0; 217 218 return 1; 219} 220 221static void 222read_map(FILE *in) 223{ 224 while (!feof(in)) { 225 if (cnt >= size) { 226 size += 10000; 227 table = realloc(table, sizeof(*table) * size); 228 if (!table) { 229 fprintf(stderr, "out of memory\n"); 230 exit (1); 231 } 232 } 233 if (read_symbol(in, &table[cnt]) == 0) 234 cnt++; 235 } 236} 237 238static void output_label(char *label) 239{ 240 if (symbol_prefix_char) 241 printf(".globl %c%s\n", symbol_prefix_char, label); 242 else 243 printf(".globl %s\n", label); 244 printf("\tALGN\n"); 245 if (symbol_prefix_char) 246 printf("%c%s:\n", symbol_prefix_char, label); 247 else 248 printf("%s:\n", label); 249} 250 251/* uncompress a compressed symbol. When this function is called, the best table 252 * might still be compressed itself, so the function needs to be recursive */ 253static int expand_symbol(unsigned char *data, int len, char *result) 254{ 255 int c, rlen, total=0; 256 257 while (len) { 258 c = *data; 259 /* if the table holds a single char that is the same as the one 260 * we are looking for, then end the search */ 261 if (best_table[c][0]==c && best_table_len[c]==1) { 262 *result++ = c; 263 total++; 264 } else { 265 /* if not, recurse and expand */ 266 rlen = expand_symbol(best_table[c], best_table_len[c], result); 267 total += rlen; 268 result += rlen; 269 } 270 data++; 271 len--; 272 } 273 *result=0; 274 275 return total; 276} 277 278static void 279write_src(void) 280{ 281 int i, k, off, valid; 282 unsigned int best_idx[256]; 283 unsigned int *markers; 284 char buf[KSYM_NAME_LEN+1]; 285 286 printf("#include <asm/types.h>\n"); 287 printf("#if BITS_PER_LONG == 64\n"); 288 printf("#define PTR .quad\n"); 289 printf("#define ALGN .align 8\n"); 290 printf("#else\n"); 291 printf("#define PTR .long\n"); 292 printf("#define ALGN .align 4\n"); 293 printf("#endif\n"); 294 295 printf(".data\n"); 296 297 output_label("kallsyms_addresses"); 298 valid = 0; 299 for (i = 0; i < cnt; i++) { 300 if (table[i].flags & SYM_FLAG_VALID) { 301 printf("\tPTR\t%#llx\n", table[i].addr); 302 valid++; 303 } 304 } 305 printf("\n"); 306 307 output_label("kallsyms_num_syms"); 308 printf("\tPTR\t%d\n", valid); 309 printf("\n"); 310 311 /* table of offset markers, that give the offset in the compressed stream 312 * every 256 symbols */ 313 markers = (unsigned int *) malloc(sizeof(unsigned int)*((valid + 255) / 256)); 314 315 output_label("kallsyms_names"); 316 valid = 0; 317 off = 0; 318 for (i = 0; i < cnt; i++) { 319 320 if (!table[i].flags & SYM_FLAG_VALID) 321 continue; 322 323 if ((valid & 0xFF) == 0) 324 markers[valid >> 8] = off; 325 326 printf("\t.byte 0x%02x", table[i].len); 327 for (k = 0; k < table[i].len; k++) 328 printf(", 0x%02x", table[i].sym[k]); 329 printf("\n"); 330 331 off += table[i].len + 1; 332 valid++; 333 } 334 printf("\n"); 335 336 output_label("kallsyms_markers"); 337 for (i = 0; i < ((valid + 255) >> 8); i++) 338 printf("\tPTR\t%d\n", markers[i]); 339 printf("\n"); 340 341 free(markers); 342 343 output_label("kallsyms_token_table"); 344 off = 0; 345 for (i = 0; i < 256; i++) { 346 best_idx[i] = off; 347 expand_symbol(best_table[i],best_table_len[i],buf); 348 printf("\t.asciz\t\"%s\"\n", buf); 349 off += strlen(buf) + 1; 350 } 351 printf("\n"); 352 353 output_label("kallsyms_token_index"); 354 for (i = 0; i < 256; i++) 355 printf("\t.short\t%d\n", best_idx[i]); 356 printf("\n"); 357} 358 359 360/* table lookup compression functions */ 361 362static inline unsigned int rehash_token(unsigned int hash, unsigned char data) 363{ 364 return ((hash * 16777619) ^ data); 365} 366 367static unsigned int hash_token(unsigned char *data, int len) 368{ 369 unsigned int hash=HASH_BASE_OFFSET; 370 int i; 371 372 for (i = 0; i < len; i++) 373 hash = rehash_token(hash, data[i]); 374 375 return HASH_FOLD(hash); 376} 377 378/* find a token given its data and hash value */ 379static struct token *find_token_hash(unsigned char *data, int len, unsigned int hash) 380{ 381 struct token *ptr; 382 383 ptr = hash_table[hash]; 384 385 while (ptr) { 386 if ((ptr->len == len) && (memcmp(ptr->data, data, len) == 0)) 387 return ptr; 388 ptr=ptr->next; 389 } 390 391 return NULL; 392} 393 394static inline void insert_token_in_group(struct token *head, struct token *ptr) 395{ 396 ptr->right = head->right; 397 ptr->right->left = ptr; 398 head->right = ptr; 399 ptr->left = head; 400} 401 402static inline void remove_token_from_group(struct token *ptr) 403{ 404 ptr->left->right = ptr->right; 405 ptr->right->left = ptr->left; 406} 407 408 409/* build the counts for all the tokens that start with "data", and have lenghts 410 * from 2 to "len" */ 411static void learn_token(unsigned char *data, int len) 412{ 413 struct token *ptr,*last_ptr; 414 int i, newprofit; 415 unsigned int hash = HASH_BASE_OFFSET; 416 unsigned int hashes[MAX_TOK_SIZE + 1]; 417 418 if (len > MAX_TOK_SIZE) 419 len = MAX_TOK_SIZE; 420 421 /* calculate and store the hash values for all the sub-tokens */ 422 hash = rehash_token(hash, data[0]); 423 for (i = 2; i <= len; i++) { 424 hash = rehash_token(hash, data[i-1]); 425 hashes[i] = HASH_FOLD(hash); 426 } 427 428 last_ptr = NULL; 429 ptr = NULL; 430 431 for (i = len; i >= 2; i--) { 432 hash = hashes[i]; 433 434 if (!ptr) ptr = find_token_hash(data, i, hash); 435 436 if (!ptr) { 437 /* create a new token entry */ 438 ptr = (struct token *) malloc(sizeof(*ptr)); 439 440 memcpy(ptr->data, data, i); 441 ptr->len = i; 442 443 /* when we create an entry, it's profit is 0 because 444 * we also take into account the size of the token on 445 * the compressed table. We then subtract GOOD_BAD_THRESHOLD 446 * so that the test to see if this token belongs to 447 * the good or bad list, is a comparison to zero */ 448 ptr->profit = -GOOD_BAD_THRESHOLD; 449 450 ptr->next = hash_table[hash]; 451 hash_table[hash] = ptr; 452 453 insert_token_in_group(&bad_head, ptr); 454 455 ptr->smaller = NULL; 456 } else { 457 newprofit = ptr->profit + (ptr->len - 1); 458 /* check to see if this token needs to be moved to a 459 * different list */ 460 if((ptr->profit < 0) && (newprofit >= 0)) { 461 remove_token_from_group(ptr); 462 insert_token_in_group(&good_head,ptr); 463 } 464 ptr->profit = newprofit; 465 } 466 467 if (last_ptr) last_ptr->smaller = ptr; 468 last_ptr = ptr; 469 470 ptr = ptr->smaller; 471 } 472} 473 474/* decrease the counts for all the tokens that start with "data", and have lenghts 475 * from 2 to "len". This function is much simpler than learn_token because we have 476 * more guarantees (tho tokens exist, the ->smaller pointer is set, etc.) 477 * The two separate functions exist only because of compression performance */ 478static void forget_token(unsigned char *data, int len) 479{ 480 struct token *ptr; 481 int i, newprofit; 482 unsigned int hash=0; 483 484 if (len > MAX_TOK_SIZE) len = MAX_TOK_SIZE; 485 486 hash = hash_token(data, len); 487 ptr = find_token_hash(data, len, hash); 488 489 for (i = len; i >= 2; i--) { 490 491 newprofit = ptr->profit - (ptr->len - 1); 492 if ((ptr->profit >= 0) && (newprofit < 0)) { 493 remove_token_from_group(ptr); 494 insert_token_in_group(&bad_head, ptr); 495 } 496 ptr->profit=newprofit; 497 498 ptr=ptr->smaller; 499 } 500} 501 502/* count all the possible tokens in a symbol */ 503static void learn_symbol(unsigned char *symbol, int len) 504{ 505 int i; 506 507 for (i = 0; i < len - 1; i++) 508 learn_token(symbol + i, len - i); 509} 510 511/* decrease the count for all the possible tokens in a symbol */ 512static void forget_symbol(unsigned char *symbol, int len) 513{ 514 int i; 515 516 for (i = 0; i < len - 1; i++) 517 forget_token(symbol + i, len - i); 518} 519 520/* set all the symbol flags and do the initial token count */ 521static void build_initial_tok_table(void) 522{ 523 int i, use_it, valid; 524 525 valid = 0; 526 for (i = 0; i < cnt; i++) { 527 table[i].flags = 0; 528 if ( symbol_valid(&table[i]) ) { 529 table[i].flags |= SYM_FLAG_VALID; 530 valid++; 531 } 532 } 533 534 use_it = 0; 535 for (i = 0; i < cnt; i++) { 536 537 /* subsample the available symbols. This method is almost like 538 * a Bresenham's algorithm to get uniformly distributed samples 539 * across the symbol table */ 540 if (table[i].flags & SYM_FLAG_VALID) { 541 542 use_it += WORKING_SET; 543 544 if (use_it >= valid) { 545 table[i].flags |= SYM_FLAG_SAMPLED; 546 use_it -= valid; 547 } 548 } 549 if (table[i].flags & SYM_FLAG_SAMPLED) 550 learn_symbol(table[i].sym, table[i].len); 551 } 552} 553 554/* replace a given token in all the valid symbols. Use the sampled symbols 555 * to update the counts */ 556static void compress_symbols(unsigned char *str, int tlen, int idx) 557{ 558 int i, len, learn, size; 559 unsigned char *p; 560 561 for (i = 0; i < cnt; i++) { 562 563 if (!(table[i].flags & SYM_FLAG_VALID)) continue; 564 565 len = table[i].len; 566 learn = 0; 567 p = table[i].sym; 568 569 do { 570 /* find the token on the symbol */ 571 p = (unsigned char *) strstr((char *) p, (char *) str); 572 if (!p) break; 573 574 if (!learn) { 575 /* if this symbol was used to count, decrease it */ 576 if (table[i].flags & SYM_FLAG_SAMPLED) 577 forget_symbol(table[i].sym, len); 578 learn = 1; 579 } 580 581 *p = idx; 582 size = (len - (p - table[i].sym)) - tlen + 1; 583 memmove(p + 1, p + tlen, size); 584 p++; 585 len -= tlen - 1; 586 587 } while (size >= tlen); 588 589 if(learn) { 590 table[i].len = len; 591 /* if this symbol was used to count, learn it again */ 592 if(table[i].flags & SYM_FLAG_SAMPLED) 593 learn_symbol(table[i].sym, len); 594 } 595 } 596} 597 598/* search the token with the maximum profit */ 599static struct token *find_best_token(void) 600{ 601 struct token *ptr,*best,*head; 602 int bestprofit; 603 604 bestprofit=-10000; 605 606 /* failsafe: if the "good" list is empty search from the "bad" list */ 607 if(good_head.right == &good_head) head = &bad_head; 608 else head = &good_head; 609 610 ptr = head->right; 611 best = NULL; 612 while (ptr != head) { 613 if (ptr->profit > bestprofit) { 614 bestprofit = ptr->profit; 615 best = ptr; 616 } 617 ptr = ptr->right; 618 } 619 620 return best; 621} 622 623/* this is the core of the algorithm: calculate the "best" table */ 624static void optimize_result(void) 625{ 626 struct token *best; 627 int i; 628 629 /* using the '\0' symbol last allows compress_symbols to use standard 630 * fast string functions */ 631 for (i = 255; i >= 0; i--) { 632 633 /* if this table slot is empty (it is not used by an actual 634 * original char code */ 635 if (!best_table_len[i]) { 636 637 /* find the token with the breates profit value */ 638 best = find_best_token(); 639 640 /* place it in the "best" table */ 641 best_table_len[i] = best->len; 642 memcpy(best_table[i], best->data, best_table_len[i]); 643 /* zero terminate the token so that we can use strstr 644 in compress_symbols */ 645 best_table[i][best_table_len[i]]='\0'; 646 647 /* replace this token in all the valid symbols */ 648 compress_symbols(best_table[i], best_table_len[i], i); 649 } 650 } 651} 652 653/* start by placing the symbols that are actually used on the table */ 654static void insert_real_symbols_in_table(void) 655{ 656 int i, j, c; 657 658 memset(best_table, 0, sizeof(best_table)); 659 memset(best_table_len, 0, sizeof(best_table_len)); 660 661 for (i = 0; i < cnt; i++) { 662 if (table[i].flags & SYM_FLAG_VALID) { 663 for (j = 0; j < table[i].len; j++) { 664 c = table[i].sym[j]; 665 best_table[c][0]=c; 666 best_table_len[c]=1; 667 } 668 } 669 } 670} 671 672static void optimize_token_table(void) 673{ 674 memset(hash_table, 0, sizeof(hash_table)); 675 676 good_head.left = &good_head; 677 good_head.right = &good_head; 678 679 bad_head.left = &bad_head; 680 bad_head.right = &bad_head; 681 682 build_initial_tok_table(); 683 684 insert_real_symbols_in_table(); 685 686 /* When valid symbol is not registered, exit to error */ 687 if (good_head.left == good_head.right && 688 bad_head.left == bad_head.right) { 689 fprintf(stderr, "No valid symbol.\n"); 690 exit(1); 691 } 692 693 optimize_result(); 694} 695 696 697int 698main(int argc, char **argv) 699{ 700 if (argc >= 2) { 701 int i; 702 for (i = 1; i < argc; i++) { 703 if(strcmp(argv[i], "--all-symbols") == 0) 704 all_symbols = 1; 705 else if (strncmp(argv[i], "--symbol-prefix=", 16) == 0) { 706 char *p = &argv[i][16]; 707 /* skip quote */ 708 if ((*p == '"' && *(p+2) == '"') || (*p == '\'' && *(p+2) == '\'')) 709 p++; 710 symbol_prefix_char = *p; 711 } else 712 usage(); 713 } 714 } else if (argc != 1) 715 usage(); 716 717 read_map(stdin); 718 optimize_token_table(); 719 write_src(); 720 721 return 0; 722} 723