12a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#!/usr/bin/perl -w
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Use: echo filename1.cc ... | find_copyrights.pl
#  or: find_copyrights.pl list_file
#  or: find_files.pl ... | find_copyrights.pl
92a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
102a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)use strict;
112a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)use warnings;
122a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)use File::Basename;
132a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
142a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)sub check_is_generated_file($);
152a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)sub start_copyright_parsing();
162a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
# Name of this script, used as the prefix of error messages.
my $progname = basename($0);

# Number of leading lines of each file that are kept for the
# generated-file check.
my $generated_file_scan_boundary = 25;

# Every input line (read via <>, i.e. from the files named on the command
# line or from STDIN) is the path of a file to scan.  For each path one
# record is printed: "<path>\t<result>\n", where <result> is either
# "GENERATED FILE", the distinct copyright statements joined with " / ",
# or "*No copyright*".
while (my $file = <>) {
    chomp $file;
    my $file_header = '';
    my %copyrights;

    # Three-argument open with a lexical handle: the path is used verbatim,
    # so characters such as '<', '>' or '|' in an input line can never be
    # interpreted as an open mode or as a command to run.
    open(my $fh, '<', $file)
        or die "$progname: Unable to access $file\n";

    my $parse_copyright = start_copyright_parsing();
    while (my $line = <$fh>) {
        # $. is the line number within the file currently being read.
        $file_header .= $line unless $. > $generated_file_scan_boundary;
        my $copyright_match = $parse_copyright->($line, $.);
        if ($copyright_match) {
            # Keyed by the lower-cased text so that statements differing
            # only in case are reported once.
            $copyrights{lc($copyright_match)} = $copyright_match;
        }
    }
    close($fh);

    my $copyright = join(" / ", sort values %copyrights);
    my $result = check_is_generated_file($file_header)
        ? "GENERATED FILE"
        : ($copyright || "*No copyright*");
    print "$file\t$result\n";
}
442a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
# Decide whether a file header looks like it belongs to a generated file.
#
# Argument: the leading text of a file as a single string.
# Returns:  1 when one of the known "generated file" phrases is present,
#           0 otherwise.
sub check_is_generated_file($) {
    my $license = uc($_[0]);
    # Remove Python multiline comments to avoid false positives
    if (index($license, '"""') != -1) {
        $license =~ s/"""[^"]*(?:"""|$)//mg;
    }
    if (index($license, "'''") != -1) {
        $license =~ s/'''[^']*(?:'''|$)//mg;
    }
    # Quick checks using index.
    if (index($license, 'ALL CHANGES MADE IN THIS FILE WILL BE LOST') != -1) {
        return 1;
    }
    # Cheap substring tests first; the regex below only runs when one of
    # the marker words is present at all.
    if (index($license, 'DO NOT EDIT') != -1 ||
        index($license, 'DO NOT DELETE') != -1 ||
        index($license, 'GENERATED') != -1) {
        # One real alternation.  The pattern must be a single regex literal:
        # quote characters and '.' inside /.../ are matched literally, they
        # do not concatenate pieces of the pattern.
        return 1 if $license =~ /(?:All changes made in this file will be lost
                                  |DO\ NOT\ (?:EDIT|delete\ this\ file)
                                  |Generated\ (?:at|automatically|data)
                                  |Automatically\ generated
                                  |\Wgenerated\s+(?:\w+\s+)*file\W)/xi;
    }
    return 0;
}
672a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
# True when the first line number is at or after the second one and no
# more than the third argument ahead of it.
sub are_within_increasing_progression($$$) {
    my ($later, $earlier, $max_gap) = @_;
    my $gap = $later - $earlier;
    return ($gap >= 0 and $gap <= $max_gap);
}
722a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
# Create a parser for the copyright statements of one file.
#
# Returns a closure that is called once per line as
#     $parser->($line, $line_number)
# and returns the text following a copyright indicator on that line
# (indicator words removed, whitespace tidied), or '' when the line holds
# no copyright statement.  The closure remembers where '(a)' and '(b)' were
# last seen in C++ comments so that a following '(c)' list item is not
# mistaken for a copyright sign; use a fresh parser for every file.
sub start_copyright_parsing() {
    my $max_line_numbers_proximity = 3;
    # Set up the defaults the way that proximity checks will not succeed.
    my $last_a_item_line_number = -200;
    my $last_b_item_line_number = -100;

    # The patterns do not depend on the line, so compile them once per
    # parser instead of rebuilding them for every line.
    my $copyright_indicator_regex =
        qr/(?:copyright|copr\.|\x{00a9}|\xc2\xa9|\(c\))/i;
    my $full_copyright_indicator_regex =
        qr/(?:\W|^)${copyright_indicator_regex}(?::\s*|\s+)(\w.*)$/i;
    # Words that, directly after the indicator, mean the line talks about
    # copyright rather than stating one.
    my $copyright_disindicator_regex =
        qr/^\s*\b(?:info(?:rmation)?|notice|and|or)\b/i;

    return sub {
        my $line = $_[0];
        my $line_number = $_[1];

        # Remove C / C++ strings to avoid false positives.
        if (index($line, '"') != -1) {
            $line =~ s/"[^"\\]*(?:\\.[^"\\]*)*"//g;
        }

        my $uc_line = uc($line);

        # Record '(a)' and '(b)' last occurrences in C++ comments.
        my $cpp_comment_idx = index($uc_line, '//');
        if ($cpp_comment_idx != -1) {
            if (index($uc_line, '(A)') > $cpp_comment_idx) {
                $last_a_item_line_number = $line_number;
            }
            if (index($uc_line, '(B)') > $cpp_comment_idx) {
                $last_b_item_line_number = $line_number;
            }
        }

        # Fast bailout, uses the same patterns as the regexp.
        # The copyright-sign needles must be double-quoted: in single
        # quotes '\x{00a9}' is the literal backslash sequence, which never
        # occurs in a file, so sign-only lines would be discarded here.
        if (index($uc_line, 'COPYRIGHT') == -1 &&
            index($uc_line, 'COPR.') == -1 &&
            index($uc_line, "\x{00a9}") == -1 &&
            index($uc_line, "\xc2\xa9") == -1) {

            my $c_item_index = index($uc_line, '(C)');
            return '' if ($c_item_index == -1);
            # Filter out 'c' used as a list item inside C++ comments.
            # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
            if ($c_item_index > $cpp_comment_idx &&
                are_within_increasing_progression(
                    $line_number,
                    $last_b_item_line_number,
                    $max_line_numbers_proximity) &&
                are_within_increasing_progression(
                    $last_b_item_line_number,
                    $last_a_item_line_number,
                    $max_line_numbers_proximity)) {
                return '';
            }
        }

        my $copyright = '';
        if ($line =~ $full_copyright_indicator_regex) {
            my $match = $1;
            if ($match !~ $copyright_disindicator_regex) {
                # Drop one trailing ',' or '.' and trailing whitespace.
                $match =~ s/([,.])?\s*$//;
                # Drop any further indicator words inside the statement.
                $match =~ s/$copyright_indicator_regex//g;
                $match =~ s/^\s+//;
                $match =~ s/\s{2,}/ /g;
                # Unescape '\@' (as written in e.g. Perl or Texinfo sources).
                $match =~ s/\\@/@/g;
                $copyright = $match;
            }
        }

        return $copyright;
    }
}
147