summaryrefslogtreecommitdiff
path: root/tools/parse_companies.pl
blob: 6dc358ee0c1f7627a5edc40b72c52b7d9ea58a36 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/perl

# parse companies from
# https://www.bluetooth.com/specifications/assigned-numbers/company-identifiers

use strict;
# use URI::Encode qw(uri_decode);

# HTML character entities that are translated locally, keyed by the entity
# name without the surrounding '&' and ';'.
my %known_entities = (
    'nbsp' => ' ',
    'eacute' => 'é',
    'auml' => 'ä',
);

# Decode the HTML character entities found in a company name.
#
# Despite its name (kept so existing callers keep working) this deals with
# HTML entities such as &amp;, not with URI percent-escapes.  A complete
# decoder such as HTML::Entities::decode_entities is the better tool if you
# have it.
#
# Takes the raw text and returns a decoded copy.  Entities listed in
# %known_entities are replaced first (the match is case-sensitive).  If any
# entity other than &amp; is still present after that, a message is written
# to STDERR and the script exits with status 1 instead of emitting a
# partially decoded name.
sub uri_decode {
    my ($name) = @_;

    foreach my $entity (keys %known_entities) {
        my $to = $known_entities{$entity};
        $name =~ s/&\Q$entity\E;/$to/g;
    }

    foreach my $entity (map { lc $_ } $name =~ /&([^;]+);/g) {
        if ($entity ne 'amp') {
            # STDOUT carries the generated C code, so report on STDERR.
            print STDERR "Unable to convert &$entity;, giving up\n";
            exit 1;
        }
    }

    # &amp; is decoded last so that the check above only ever sees entities
    # that were present in the input.  A double-encoded &amp;nbsp; turns into
    # &nbsp; here and is then replaced by a plain space.
    $name =~ s/&amp;/&/ig;
    $name =~ s/&nbsp;/ /ig;
    return $name;
}

# never parse HTML with regex!
# except when you should

# Scan the HTML supplied on STDIN (or in the files named on the command line)
# line by line.  A <td> holding a 0xNNNN value is taken as a company
# identifier and the next matching <td>...</td> line as its name; for each
# such pair a C "case <id>:" / "return "<name>";" fragment is printed.

my $identifier;        # most recent identifier seen, as a "0x...." string
my $next_is_name = 0;  # true while waiting for the <td> that holds the name

while (<>) {
    s/\xe2\x80\x8b//g; # kill zero width space (UTF-8 bytes of U+200B)

    # grab identifier (in hex)
    if (/\<td.*(0x[0-9A-F]{4})/i) {
        $identifier = $1;
        $next_is_name = 1;

    # next <td> should be company name
    } elsif ($next_is_name && m|\<td.*\>(.*)\</td\>|) {
        my $name = uri_decode($1);
        $name =~ s/^\s+//; # kill leading
        $name =~ s/\s+$//; # and trailing space

        # The name ends up inside a C string literal, so backslashes and
        # double quotes have to be escaped or the generated code is broken.
        $name =~ s/(["\\])/\\$1/g;

        my $id = hex($identifier);

        # 0xFFFF is deliberately not emitted -- presumably a reserved value
        # handled by the consumer of the generated code; TODO confirm.
        if ($id != 0xFFFF) {
            print "\tcase $id:\n";
            print "\t\treturn \"$name\";\n";
        }
        $next_is_name = 0;
    }
}