[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [TV] Formatting listings like the radiotimes website
Andrew Flegg wrote:
On Mon, Mar 07, 2005 at 05:43:14AM +0000, Richard Lewis wrote:
Do you plan to add the other ITV regions? (also things like BBC 2
Wales, and there were some other digital terrestrial channels not
available)
Different regions are planned, but I'm only really motivated if their
listings dramatically differ; other channel requests are welcome, but
will depend on them being on a network which already has a parser or
having a decent website with the wanted information on it.
The attached Perl script will do that for the ITV channels. It pulls the
TV listings off the Guardian newspaper website and creates an XML file in
the same format as the current bleb.org XML output.
#!/usr/bin/perl
use strict;
use English;
use LWP::Simple qw/$ua get/;
# Identification sent with requests made through the LWP::Simple agent.
my ($useragent, $from) = (
    'BlipVert ITV Component 0.1',
    'steve@xxxxxxxxxxxxxx',
);

# Layout of a Guardian TV listings page
# -------------------------------------
# The channel title sits inside the header section:
#   <!-- Header -->
#   <FONT FACE="Arial,Helvetica,sans-serif" SIZE="5"><B>
#   <!-- header in size 5; arial,helvetica,sans-serif -->
#   <!-- INSERT CHANNEL NAME -->
#   Tyne Tees
#   </B></FONT><HR SIZE=1 NOSHADE>
#   <!-- End header -->
# Every programme sits between the markers
#   <!-- start of TV style -->   and   <!-- end of TV style -->

# Apply the identification to the user agent exported by LWP::Simple.
$ua->from($from);
$ua->agent($useragent);
#########################################################################
sub convert_time {
    # Convert a 12-hour clock string such as "10.30am", "6pm" or "12.05PM"
    # into a zero-padded 24-hour "HHMM" string.
    #
    # Takes:   one time string. Hours and minutes may be separated by "."
    #          or ":", the minutes are optional, the am/pm suffix is
    #          optional and matched case-insensitively, and surrounding
    #          whitespace is ignored.
    # Returns: the four-character "HHMM" string, or '' when the argument is
    #          undefined or does not look like a time.
    my $timestr = shift();
    return '' unless defined $timestr;

    my ($hour, $min, $meridiem);
    if ($timestr =~ /\A\s*(\d{1,2})(?:[.:](\d{1,2}))?\s*([ap]m)?\s*\z/i) {
        ($hour, $min, $meridiem) = ($1, $2, $3);
    }
    else {
        return '';
    }

    $min      = 0  unless defined $min;
    $meridiem = '' unless defined $meridiem;
    $meridiem = lc $meridiem;

    # 12pm is noon and stays 12; every other pm hour moves to the afternoon.
    if ($meridiem eq 'pm' and $hour < 12) {
        $hour += 12;
    }
    # 12am is midnight, i.e. hour 00.
    if ($meridiem eq 'am' and $hour == 12) {
        $hour = 0;
    }
    # An hour of 24 wraps round to 00.
    if ($hour == 24) {
        $hour = 0;
    }

    return sprintf('%02d%02d', $hour, $min);
}
#########################################################################
sub _xml_escape {
    # Make a string safe to place in XML text or a double-quoted attribute.
    # An "&" that already starts one of the predefined XML entities or a
    # numeric character reference is left alone so it is not escaped twice.
    my $text = shift;
    return '' unless defined $text;
    $text =~ s/&(?!(?:amp|lt|gt|quot|apos|#\d+|#x[0-9a-fA-F]+);)/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

sub _listing_fields {
    # Split one cleaned-up record into its first three "</B>"-separated
    # fields (start time, title, description), each trimmed of surrounding
    # whitespace. Returns an empty list for an undefined record.
    my $record = shift;
    return unless defined $record;
    $record =~ s/<B>//g;
    my @fields = (split /<\/B>/, $record)[0 .. 2];
    for my $field (@fields) {
        next unless defined $field;
        $field =~ s/\A\s+|\s+\z//g;
    }
    return @fields;
}

sub process_channel {
    # Fetch one Guardian listings page and write its programmes to an XML
    # file.
    #
    # Named parameters:
    #   url    - address of the listings page to fetch
    #   name   - channel name written into the <channel id="..."> attribute
    #   xmlout - path of the XML file to create (overwritten if it exists)
    #
    # Returns 1 on success. When the page cannot be fetched, or the output
    # file cannot be opened or closed, a warning is issued and nothing
    # (undef / empty list) is returned, so a caller processing several
    # channels can carry on with the next one.
    my %parms = @_;

    # get() returns undef when the request fails; stop before touching the
    # output file so an earlier good copy is not overwritten.
    my $content = get($parms{url});
    if (!defined $content) {
        warn "process_channel: could not fetch $parms{url}\n";
        return;
    }

    # Cut the page into records after every "... of TV style -->" marker
    # (this matches both the start and the end marker). Each record keeps
    # its trailing marker text. The first record is whatever precedes the
    # first marker, so it is dropped.
    my $marker   = ' of TV style -->';
    my @programs = split /(?<=\Q$marker\E)/, $content;
    shift @programs;

    # drop some of the html - leaves bold around time and title
    # so the record can be split on that later
    foreach my $program (@programs) {
        $program =~ s/(<TD.*)|(<TR>.*)|(<FONT.*)//ig;
        $program =~ s/<!--.*//g;
        $program =~ s/(<\/TD>)|(<\/TR>)//ig;
        $program =~ s/(<\/FONT>)//ig;
        $program =~ s/\s{2,}//g;
        $program =~ s/&.*\s//g;    # sgml character entities confuse xml parser
    }

    # date for xml header (day/month/year, not zero-padded)
    my ($day, $mon, $year) = (localtime)[3, 4, 5];
    my $date = join '/', $day, $mon + 1, $year + 1900;

    my $out;
    if (!open $out, '>', $parms{xmlout}) {
        warn "process_channel: cannot write $parms{xmlout}: $!\n";
        return;
    }

    # output XML header
    print {$out} '<?xml version="1.0" encoding="UTF-8"?>', "\n";
    print {$out} '<channel id="', _xml_escape($parms{name}),
        '" source="The Guardian" ', 'date="', $date, '">';

    # dump show titles and timings
    for my $i (0 .. $#programs) {
        my ($start, $title, $desc) = _listing_fields($programs[$i]);

        # The end time is taken from the record two places further on;
        # presumably records alternate between a programme and the markup
        # separating it from the next one. Copy the element so that looking
        # past the end of the array cannot extend it.
        my $following   = $programs[$i + 2];
        my ($nextstart) = _listing_fields($following);

        $start     = convert_time($start);
        $nextstart = convert_time($nextstart);

        if ($start and $title and $desc) {
            $title = _xml_escape($title);
            $desc  = _xml_escape($desc);
            print {$out} "\n<programme>\n",
                "<desc>$desc</desc>\n",
                "<title>$title</title>\n",
                "<end>$nextstart</end>\n",
                "<infourl>Sorry, information not provided by source The Guardian</infourl>\n",
                "<start>$start</start>\n",
                "</programme>\n";
        }
    }

    print {$out} "</channel>\n";

    # Buffered write errors only surface at close, so check it.
    if (!close $out) {
        warn "process_channel: error closing $parms{xmlout}: $!\n";
        return;
    }
    return 1;
}
#########################################################################
# Generate the listings file for the Tyne Tees region.
my %tyne_tees = (
    name   => 'ITV Tyne Tees',
    url    => 'http://www.guardian.co.uk/TV/tyne_tyne.html',
    xmlout => '/home/steve/public_html/TV/tyne_tees.xml',
);
process_channel(%tyne_tees);
# Anglia
# http://www.guardian.co.uk/TV/anglia_anglia.html
# Border
# http://www.guardian.co.uk/TV/border_border.html
# Carlton
# process_channel(url => 'http://www.guardian.co.uk/TV/carlton_carlton.html',
# name => 'ITV Carlton',
# xmlout => '/home/steve/public_html/TV/carlton.xml');
# Central
# http://www.guardian.co.uk/TV/central_central.html
# Channel
# http://www.guardian.co.uk/TV/channel_channel.html
# Grampian
# http://www.guardian.co.uk/TV/grampian_grampian.html
# Granada
# http://www.guardian.co.uk/TV/granada_granada.html
# HTV Wales
# http://www.guardian.co.uk/TV/htvwales_htv_wales.html
# HTV West
# http://www.guardian.co.uk/TV/htvwest_htv_west.html
# Meridian
# http://www.guardian.co.uk/TV/meridian_meridian.html
# Tyne Tees
# http://www.guardian.co.uk/TV/tyne_tyne.html
# Scottish
# http://www.guardian.co.uk/TV/scottish_scottish.html
# Ulster
# http://www.guardian.co.uk/TV/ulster_ulster.html
# West Country
# http://www.guardian.co.uk/TV/westcountry_westcountry.html
# Yorkshire
# http://www.guardian.co.uk/TV/ytv_ytv.html
# South East
# http://www.guardian.co.uk/TV/meridian_southeast.html
# London
# http://www.guardian.co.uk/TV/carlton_london.html
# Scotland
# http://www.guardian.co.uk/TV/bordern_border_north.html