#!/usr/bin/perl use strict; use warnings; use feature qw(say); use utf8; use Getopt::Long; use YAML::Any qw(LoadFile); use LWP::UserAgent; use XML::Feed; use HTML::Entities; ##### variables our $VERSION = '0.02'; my %opts = ( config => "$ENV{HOME}/.config/twatch-lite/config", state => "$ENV{HOME}/.config/twatch-lite/state", verbose => 0, ); ##### functions sub usage { print <] -c, --config=FILE Path to config file, mandatory default: $opts{config} -s, --state=FILE Path to file with feed positions, optional default: $opts{state} -v, --verbose Increase verbosity USAGE exit shift // 1; } sub _log { my ($level, $message) = @_; return if ($level eq 'debug' and not $opts{verbose}); print STDERR "[$level] ", $message, "\n"; exit(1) if $level eq 'fatal'; } sub load_config { my ($path) = @_; _log(debug => "Loading config"); _log(fatal => "Can't find config: no such file") unless -f $path; my $config; eval { $config = LoadFile($path); 1 } or do { _log(fatal => "Can't load config: $@"); }; _log(debug => "Sanity checks for config file"); _log(fatal => "Missing section `feeds` in config file") unless (exists $config->{feeds} and ref $config->{feeds} eq 'ARRAY'); _log(warn => "'outdir' not set in config, using /tmp"), $config->{outdir} = '/tmp' unless ($config->{outdir}); my $i = 0; foreach my $feed (@{ $config->{feeds} }) { $i++; _log(debug => "Processing feed #$i: $feed->{url}"); _log(fatal => "Feed is not a hashref") unless (ref $feed eq 'HASH'); _log(fatal => "Missing 'url' in feed") unless $feed->{url} and $feed->{url} =~ m{https?://}oi; _log(fatal => "Missing 'matches' section in feed") unless $feed->{matches} and ref $feed->{matches} eq 'ARRAY'; _log(fatal => "Empty 'matches' section in feed") unless (scalar @{ $feed->{matches} }); $feed->{linktype} //= 'direct'; $feed->{lookup} //= 'title'; # TODO: more ($feed->{hostname}) = ($feed->{url} =~ m{://([a-z0-9\.-]+)}oi); } return $config; } sub load_state { my ($path) = @_; my $state = {}; _log(debug => "Loading file with feed positions"); if (-f $path) { open my $FH, '<', $path; while (1) { last if eof($FH); my $line = <$FH>; chomp $line; next unless ($line =~ m{^([a-z0-9\.-]+),(\d+)}oi); $state->{$1} = $2; # hostname => lastseen } close $FH; } else { _log(warn => "Missing state file, will create one at exit"); } return $state; } sub process_feed { my ($ua, $feedconf) = @_; my $urls = {}; my $resp = $ua->get($feedconf->{url}); unless ($resp->is_success) { _log(error => "Can't fetch feed"); return $urls; } my $data = decode_entities($resp->decoded_content); my $feed = XML::Feed->parse(\$data); unless ($feed) { _log(error => "Can't parse feed: " . XML::Feed->errstr); return $urls; } my @entries = reverse $feed->entries; for my $entry (@entries) { my $time = (ref $entry->issued) ? $entry->issued->epoch : $entry->modified->epoch; if ($feedconf->{lastseen} >= $time) { _log(debug => "Already seen before: $time"); next; } my $text = ($feedconf->{lookup} eq 'body') ? $entry->content->body : $entry->title; _log(info => "Processing: " . $entry->title); _log(debug => "Match text is: ". $text); foreach my $match (@{ $feedconf->{matches} }) { if ($match =~ m{^/(.*)/$}) { _log(debug => "Trying regex '$1'"); next unless $text =~ qr/$1/i; } else { _log(debug => "Trying substr '$match'"); next unless index(lc($text), lc($match)) >= 0; } # gotcha! my $msg = sprintf "Got it! URL: %s (%s)", $entry->link, $entry->title; _log(info => $msg); $urls->{$entry->link} = $entry->title; } $feedconf->{lastseen} = $time; } return $urls; } sub fetch_torrents { my ($ua, $outdir, $urls) = @_; while (my ($url, $title) = each(%{ $urls })) { my $resp = $ua->get($url); unless ($resp->is_success) { _log(error => "Can't fetch torrent file: $url"); next; } my $file = sprintf "%s/%s", $outdir, $resp->filename; open my $T, '>', $file; unless ($T) { _log(error => "Can't save torrent to $file"); next; } print $T $resp->decoded_content; close $T; } return 1; } sub save_state { my ($path, $state) = @_; open my $FH, '>', "$path.new"; unless ($FH) { _log(fatal => "Can't open state file for write"); return; } while (my ($host, $time) = each(%{ $state })) { printf $FH "%s,%s\n", $host, $time; } close $FH; rename("$path.new", $path) or _log(error => "Can't update state file: $!"); return 1; } ##### logic GetOptions( "config=s" => \$opts{config}, "state=s" => \$opts{state}, "verbose" => \$opts{verbose}, ) or usage(1); my $config = load_config($opts{config}); my $state = load_state ($opts{state}); my $ua = LWP::UserAgent->new; $ua->agent("twatch-lite/$VERSION"); $ua->show_progress(1) if $opts{verbose}; if ($config->{proxy}) { $ua->proxy(['http', 'https'], $config->{proxy}); } else { $ua->env_proxy; } if ($config->{cookies}) { $ua->cookie_jar({ file => "$ENV{HOME}/.config/twatch-lite/cookies.txt" }); } foreach my $feed (@{ $config->{feeds} }) { my $host = $feed->{hostname}; $feed->{lastseen} = $state->{$host} // 0; my $urls = process_feed($ua, $feed); fetch_torrents($ua, $config->{outdir}, $urls) if scalar keys $urls; $state->{$host} = $feed->{lastseen}; } save_state($opts{state}, $state); exit 0; __END__ =pod =head1 NAME twatch-lite -- periodically polls RSS/ATOM feeds and cherry-picks wanted torrent files =head1 SYNOPSIS man twatch-lite mkdir -p ~/.config/twatch-lite vim ~/.config/twatch-lite/config twatch-lite --verbose # 1st time, debug =head1 DESCRIPTION This is very simple and specialized alternative to full-featured software called L. It works only with RSS/ATOM feeds. =head1 Config file File format is simple: YAML. Real config example: --- outdir: /tmp proxy: http://192.168.1.1:3128 feeds: - url: http://torrent-tracker.org/rss.php linktype: direct lookup: title matches: - 'match substring' - '/match regex/' Now let looks closer: we have a lot of global parameters and section 'feeds' =head2 Global parameters =head3 C Where to save downloaded torrent files. You may want configure your torrent-client to pickup files from this dir. If unset, you'll get warning, and '/tmp' will be used. =head3 C If set - will use http proxy, when fetching rss and torrent-files. Otherwise will connect directly (default). =head3 C If set to any non-empty value, will accept and send back cookies in http-requests. Default - unset. =head2 Section C Configured feeds. =head3 C URL with RSS/ATOM feed. This option must be set in each feed. =head3 C List of wanted 'keywords'. If keyword surrounded with slashes, like C, it will be handled as regex, and as substring otherwise. Regex syntax is PCRE and described in L. If feed entry matches ANY keyword, torrent file will be downloaded and stored to B. Matches list must contain at least one keyword. If you want all torrents from this feed, use '/.*/' regex keyword. =head3 C Lookup this field in RSS/ATOM entry for wanted keywords. Acceptable values: 'body', 'title' (default). =head3 C Type of link in RSS/ATOM entry. Possible values: * direct -- link points directly to torrent file (default) * page -- link points to page with link to torrent file (not implemented yet) =head1 SEE ALSO L =head1 AUTHORS Alex 'AdUser' Z =cut