Added fetchers concept: separate scripts to fetch the feeds

Fetchers impersonate a specific client by sending the same headers
that client would send. This is better than a plain curl request with
a faked user agent, because such a request lacks the other headers a
real client sends, so its traffic stands out.
Albert S. 2017-08-11 12:57:30 +02:00
parent 8a80aa0d6d
commit 3a723b9440
6 changed files with 40 additions and 9 deletions
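
For contrast, the kind of naive request the commit message argues
against would look roughly like this (illustrative sketch only; the
URL is a placeholder):

    #Fake User-Agent, but no Accept, Accept-Language etc. headers:
    #the header set matches no real browser, so the traffic stands out.
    curl 'https://example.com/feed.xml' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36' > feed.xml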

fetchers/chrome Executable file

@@ -0,0 +1,10 @@
#!/bin/sh
#Tries more or less to look like Chrome
if [ $# -ne 2 ] ; then
    echo "usage: $0 url output" 1>&2
    exit 1
fi
#TODO: better randomization
useragent=$(shuf -n 1 "$RANDRSS_ROOT/fetchers/chrome_agents")
curl "$1" -H 'Accept-Encoding: gzip, deflate, br' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H "User-Agent: $useragent" -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' -H 'Connection: keep-alive' -H 'Cache-Control: max-age=0' --compressed > "$2"

fetchers/chrome_agents Normal file

@@ -0,0 +1 @@
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36

fetchers/firefox Executable file

@@ -0,0 +1,11 @@
#!/bin/sh
set -x
#Tries more or less to look like Firefox
if [ $# -ne 2 ] ; then
    echo "usage: $0 url output" 1>&2
    exit 1
fi
#TODO: better randomization
useragent=$(shuf -n 1 "$RANDRSS_ROOT/fetchers/firefox_agents")
curl "$1" -H "User-Agent: $useragent" -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' -H 'Accept-Language: en-US,en;q=0.5' -H 'Accept-Encoding: gzip, deflate, br' --compressed -H 'Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' > "$2"

fetchers/firefox_agents Normal file

@@ -0,0 +1,2 @@
Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0
Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0

fetcherslist Normal file

@@ -0,0 +1,2 @@
fetchers/firefox
fetchers/chrome
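
randrss picks one line from this file at random for every fetch, so a
new client can be added by creating another executable that takes a
url and an output file and listing it here. A minimal hypothetical
example (fetchers/plainwget is not part of this commit and does no
header mimicry; it only illustrates the interface):

    #!/bin/sh
    #Hypothetical fetcher: demonstrates the url/output contract only
    if [ $# -ne 2 ] ; then
        echo "usage: $0 url output" 1>&2
        exit 1
    fi
    wget -q -O "$2" "$1"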

randrss

@@ -1,21 +1,28 @@
 #!/bin/bash
+set -x
 set -e
+#TODO: make this more robust
+export RANDRSS_ROOT=$(pwd)
 random_default=$( shuf -n 1 -i720-753)
 DEFAULT_PER_ITEM="1-$random_default"
 echo "Current default sleep seconds range: $DEFAULT_PER_ITEM"
 if [ $# -lt 1 ] ; then
-    echo "Usage: $0 inputfile [user agents] [syncnow]"
+    echo "Usage: $0 inputfile fetchersfile [syncnow]"
     exit
 fi
 inputfile="$1"
-useragentsfile="$2"
+fetchersfile="$2"
 if [ ! -f "$inputfile" ] ; then
     echo "inputfile does not exist or is not readable" 1>&2
     exit 1
 fi
+if [ ! -f "$fetchersfile" ] ; then
+    echo "fetchersfile does not exist or is not readable" 1>&2
+    exit 1
+fi
 syncnow=0
 if [ "$3" = "syncnow" ] ; then
     syncnow=1
@@ -31,13 +38,11 @@ while true ; do
         else
             sleepfor=$( shuf -i "$DEFAULT_PER_ITEM" -n 1)
         fi
-        useragent=""
-        if [ -n "$useragentsfile" ] ; then
-            useragent=$( shuf -n 1 "$useragentsfile" )
-        fi
-        echo "Sleeping for $sleepfor seconds for $url"
-        [ $syncnow -eq 1 ] || sleep "$sleepfor"
-        torsocks wget "$url" -U "$useragent" -O "$output" || echo "Failed to fetch $url"
+        fetcher=$( shuf -n 1 "$fetchersfile" )
+        [ $syncnow -eq 1 ] || ( echo "Sleeping for $sleepfor seconds for $url, chosen fetcher $fetcher" && sleep "$sleepfor" )
+        echo "Fetching $url with $fetcher"
+        torsocks "./$fetcher" "$url" "$output" || echo "Failed to fetch $url"
     done
     [ $syncnow -eq 1 ] && exit
 done
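
A hypothetical end-to-end run (feeds.txt is a placeholder name for an
input file in the format randrss already expects):

    ./randrss feeds.txt fetcherslist
    #syncnow skips the random sleeps and exits after one pass:
    ./randrss feeds.txt fetcherslist syncnow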