diff options
Diffstat (limited to 'tools/parse_companies.pl')
-rwxr-xr-x | tools/parse_companies.pl | 59 |
1 files changed, 59 insertions, 0 deletions
#!/usr/bin/perl

# Parse companies from
# https://www.bluetooth.com/specifications/assigned-numbers/company-identifiers
#
# Reads the saved HTML of that page (files named on the command line, or
# stdin) and prints one C switch arm per company on stdout:
#
#     case <decimal id>:
#         return "<company name>";
#
# Diagnostics go to stderr so they never end up inside the generated C.

use strict;
use warnings;
# use URI::Encode qw(uri_decode);

# Named HTML entities we know how to translate.  The replacement strings are
# raw bytes as they appear in this source file (there is deliberately no
# "use utf8"), matching the byte-oriented handling of the input below.
my %known_entities = (
    'nbsp'   => ' ',
    'eacute' => 'é',
    'auml'   => 'ä',
);

# Decode the HTML entities in a company name.
#
# better to use URI::Encode if you have it
# NOTE(review): URI::Encode decodes %XX escapes; for HTML entities the CPAN
# module HTML::Entities (decode_entities) looks like the closer match.
#
# Parameters: $_[0] - raw text taken from a table cell.
# Returns:    the text with every entity from %known_entities and every
#             &amp; (any letter case) replaced by its character.
# Exits:      prints a diagnostic to stderr and exits with status 1 when any
#             other "&...;" sequence is found, so an incomplete table is
#             never silently written.
sub uri_decode {
    my ($name) = @_;

    # Entity names are case-sensitive in HTML, so match them exactly.
    for my $entity (keys %known_entities) {
        $name =~ s/&\Q$entity\E;/$known_entities{$entity}/g;
    }

    # Whatever still looks like an entity, apart from &amp;, is unknown.
    for my $entity (map { lc } $name =~ /&([^;]+);/g) {
        next if $entity eq 'amp';
        print STDERR "Unable to convert &$entity;, giving up\n";
        exit 1;
    }

    # Decode &amp; last so that "&amp;nbsp;" yields the literal text "&nbsp;"
    # instead of being decoded twice.
    $name =~ s/&amp;/&/ig;

    return $name;
}

# Escape text for use inside a double-quoted C string literal.
#
# Parameters: $_[0] - text to embed.
# Returns:    the text with every backslash and double quote preceded by a
#             backslash.
sub c_escape {
    my ($text) = @_;
    $text =~ s/(["\\])/\\$1/g;
    return $text;
}

# never parse HTML with regex!
# except when you should

# Scan the input line by line.  A <td> containing a 4-digit hex identifier
# arms $next_is_name; the next line holding a complete <td>...</td> is then
# taken as the company name for that identifier.
sub main {
    my $identifier;
    my $next_is_name = 0;

    while (my $line = <>) {
        $line =~ s/\xe2\x80\x8b//g;    # kill zero width space (UTF-8 bytes)

        # grab identifier (in hex)
        if ($line =~ /<td.*(0x[0-9A-F]{4})/i) {
            $identifier   = $1;
            $next_is_name = 1;
        }
        # next <td> should be company name
        elsif ($next_is_name && $line =~ m|<td.*>(.*)</td>|) {
            my $name = uri_decode($1);
            $name =~ s/^\s+//;         # kill leading
            $name =~ s/\s+$//;         # and trailing space

            my $id = hex($identifier);

            # 0xFFFF is skipped -- presumably a reserved/placeholder entry;
            # confirm against the assigned-numbers page.
            if ($id != 65535) {
                print "\tcase $id:\n";
                print "\t\treturn \"", c_escape($name), "\";\n";
            }
            $next_is_name = 0;
        }
    }

    return;
}

# Run only when executed as a script, so the helpers above can be loaded
# (require/do) without consuming stdin.
main() unless caller;

1;