From 3a723b94403211c9dbd97c5bbdfcc12a88ebf1c3 Mon Sep 17 00:00:00 2001 From: Albert S Date: Fri, 11 Aug 2017 12:57:30 +0200 Subject: [PATCH] Added fetchers concept: separate scripts to fetch the feeds Fetchers claim to be a certain client. They try to send the same headers as the original client. That's better than a simple curl request with a fake user agent, because curl doesn't send the other headers like the original client and therefore its traffic stands out. --- fetchers/chrome | 10 ++++++++++ fetchers/chrome_agents | 1 + fetchers/firefox | 11 +++++++++++ fetchers/firefox_agents | 2 ++ fetcherslist | 2 ++ randrss | 23 ++++++++++++++--------- 6 files changed, 40 insertions(+), 9 deletions(-) create mode 100755 fetchers/chrome create mode 100644 fetchers/chrome_agents create mode 100755 fetchers/firefox create mode 100644 fetchers/firefox_agents create mode 100644 fetcherslist diff --git a/fetchers/chrome b/fetchers/chrome new file mode 100755 index 0000000..21fcfbe --- /dev/null +++ b/fetchers/chrome @@ -0,0 +1,10 @@ +#!/bin/sh +#Tries more or less to look like Chrome +if [ $# -ne 2 ] ; then +echo "usage: $0 url output" 1>&2 +exit 1 +fi +#better randomize +useragent=$(shuf -n 1 "$RANDRSS_ROOT/fetchers/chrome_agents") + +curl "$1" -H 'Accept-Encoding: gzip, deflate, br' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H "User-Agent: $useragent" -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' -H 'Connection: keep-alive' -H 'Cache-Control: max-age=0' --compressed > "$2" diff --git a/fetchers/chrome_agents b/fetchers/chrome_agents new file mode 100644 index 0000000..6679857 --- /dev/null +++ b/fetchers/chrome_agents @@ -0,0 +1 @@ +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36 diff --git a/fetchers/firefox b/fetchers/firefox new file mode 100755 index 0000000..11e9bc8 --- /dev/null +++ b/fetchers/firefox @@ -0,0
+1,11 @@ +#!/bin/sh +#set -x +#Tries more or less to look like Firefox +if [ $# -ne 2 ] ; then +echo "usage: $0 url output" 1>&2 +exit 1 +fi +#better randomize +useragent=$(shuf -n 1 "$RANDRSS_ROOT/fetchers/firefox_agents") + +curl "$1" -H "User-Agent: $useragent" -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' -H 'Accept-Language: en-US,en;q=0.5' -H 'Accept-Encoding: gzip, deflate, br' --compressed -H 'Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' > "$2" diff --git a/fetchers/firefox_agents b/fetchers/firefox_agents new file mode 100644 index 0000000..8af2681 --- /dev/null +++ b/fetchers/firefox_agents @@ -0,0 +1,2 @@ +Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0 +Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0 diff --git a/fetcherslist b/fetcherslist new file mode 100644 index 0000000..c47f875 --- /dev/null +++ b/fetcherslist @@ -0,0 +1,2 @@ +fetchers/firefox +fetchers/chrome diff --git a/randrss b/randrss index 0ca6c5f..705c28e 100755 --- a/randrss +++ b/randrss @@ -1,21 +1,28 @@ #!/bin/bash set -x set -e +#TODO: make this more robust +export RANDRSS_ROOT="$(pwd)" random_default=$( shuf -n 1 -i720-753) DEFAULT_PER_ITEM="1-$random_default" echo "Current default sleep seconds range: $DEFAULT_PER_ITEM" if [ $# -lt 1 ] ; then -echo "Usage: $0 inputfile [user agents] [syncnow]" +echo "Usage: $0 inputfile fetchersfile [syncnow]" exit fi inputfile="$1" -useragentsfile="$2" +fetchersfile="$2" if [ ! -f "$inputfile" ] ; then echo "inputfile does not exist or is not readable" 1>&2 exit 1 fi +if [ ! 
-f "$fetchersfile" ] ; then +echo "fetchersfile does not exist or is not readable" 1>&2 +exit 1 +fi + syncnow=0 if [ "$3" = "syncnow" ] ; then syncnow=1 @@ -31,13 +38,11 @@ while true ; do else sleepfor=$( shuf -i "$DEFAULT_PER_ITEM" -n 1) fi - useragent="" - if [ -n "$useragentsfile" ] ; then - useragent=$( shuf -n 1 "$useragentsfile" ) - fi - echo "Sleeping for $sleepfor seconds for $url" - [ $syncnow -eq 1 ] || sleep "$sleepfor" - torsocks wget "$url" -U "$useragent" -O "$output" || echo "Failed to fetch $url" + fetcher=$( shuf -n 1 "$fetchersfile" ) + + [ $syncnow -eq 1 ] || ( echo "Sleeping for $sleepfor seconds for $url, chosen fetcher $fetcher" && sleep "$sleepfor" ) + echo "Fetching $url with $fetcher" + torsocks "./$fetcher" "$url" "$output" || echo "Failed to fetch $url" done [ $syncnow -eq 1 ] && exit done