Ako odfiltrovať crawler botov

Web crawler je bot, ktorý prechádza web a indexuje ho. Tiež je známy aj pod názvom indexer. Niekedy chcete logovať všetky kliky na určitú URL adresu (napr. z dôvodu štatistiky čítanosti, alebo konverzie pokiaľ je to nejaký affiliate link).

Problémom je, že takýto bot vlastne nie je reálnym užívateľom klikajúcim na Váš odkaz ale len automatickým programom. Tým pádom sa vlastne skresľuje štatistika klikov.

Riešením je odfiltrovať kliky obsahujúce slová, ktoré používajú boty a budú v premennej $_SERVER[‘HTTP_USER_AGENT’].

Databázu takýchto slov môžete získať z nejakého open source projektu – napr. PhpBB:

INSERT INTO `phpbb_bots` (`bot_id`, `bot_active`, `bot_name`, `user_id`, `bot_agent`, `bot_ip`) VALUES
(1, 1, 'AdsBot [Google]', 3, 'AdsBot-Google', ''),
(2, 1, 'Alexa [Bot]', 4, 'ia_archiver', ''),
(3, 1, 'Alta Vista [Bot]', 5, 'Scooter/', ''),
(4, 1, 'Ask Jeeves [Bot]', 6, 'Ask Jeeves', ''),
(5, 1, 'Baidu [Spider]', 7, 'Baiduspider+(', ''),
(6, 1, 'Exabot [Bot]', 8, 'Exabot/', ''),
(7, 1, 'FAST Enterprise [Crawler]', 9, 'FAST Enterprise Crawler', ''),
(8, 1, 'FAST WebCrawler [Crawler]', 10, 'FAST-WebCrawler/', ''),
(9, 1, 'Francis [Bot]', 11, 'http://www.neomo.de/', ''),
(10, 1, 'Gigabot [Bot]', 12, 'Gigabot/', ''),
(11, 1, 'Google Adsense [Bot]', 13, 'Mediapartners-Google', ''),
(12, 1, 'Google Desktop', 14, 'Google Desktop', ''),
(13, 1, 'Google Feedfetcher', 15, 'Feedfetcher-Google', ''),
(14, 1, 'Google [Bot]', 16, 'Googlebot', ''),
(15, 1, 'Heise IT-Markt [Crawler]', 17, 'heise-IT-Markt-Crawler', ''),
(16, 1, 'Heritrix [Crawler]', 18, 'heritrix/1.', ''),
(17, 1, 'IBM Research [Bot]', 19, 'ibm.com/cs/crawler', ''),
(18, 1, 'ICCrawler - ICjobs', 20, 'ICCrawler - ICjobs', ''),
(19, 1, 'ichiro [Crawler]', 21, 'ichiro/', ''),
(20, 1, 'Majestic-12 [Bot]', 22, 'MJ12bot/', ''),
(21, 1, 'Metager [Bot]', 23, 'MetagerBot/', ''),
(22, 1, 'MSN NewsBlogs', 24, 'msnbot-NewsBlogs/', ''),
(23, 1, 'MSN [Bot]', 25, 'msnbot/', ''),
(24, 1, 'MSNbot Media', 26, 'msnbot-media/', ''),
(25, 1, 'NG-Search [Bot]', 27, 'NG-Search/', ''),
(26, 1, 'Nutch [Bot]', 28, 'http://lucene.apache.org/nutch/', ''),
(27, 1, 'Nutch/CVS [Bot]', 29, 'NutchCVS/', ''),
(28, 1, 'OmniExplorer [Bot]', 30, 'OmniExplorer_Bot/', ''),
(29, 1, 'Online link [Validator]', 31, 'online link validator', ''),
(30, 1, 'psbot [Picsearch]', 32, 'psbot/0', ''),
(31, 1, 'Seekport [Bot]', 33, 'Seekbot/', ''),
(32, 1, 'Sensis [Crawler]', 34, 'Sensis Web Crawler', ''),
(33, 1, 'SEO Crawler', 35, 'SEO search Crawler/', ''),
(34, 1, 'Seoma [Crawler]', 36, 'Seoma [SEO Crawler]', ''),
(35, 1, 'SEOSearch [Crawler]', 37, 'SEOsearch/', ''),
(36, 1, 'Snappy [Bot]', 38, 'Snappy/1.1 ( http://www.urltrends.com/ )', ''),
(37, 1, 'Steeler [Crawler]', 39, 'http://www.tkl.iis.u-tokyo.ac.jp/~crawler/', ''),
(38, 1, 'Synoo [Bot]', 40, 'SynooBot/', ''),
(39, 1, 'Telekom [Bot]', 41, 'crawleradmin.t-info@telekom.de', ''),
(40, 1, 'TurnitinBot [Bot]', 42, 'TurnitinBot/', ''),
(41, 1, 'Voyager [Bot]', 43, 'voyager/1.0', ''),
(42, 1, 'W3 [Sitesearch]', 44, 'W3 SiteSearch Crawler', ''),
(43, 1, 'W3C [Linkcheck]', 45, 'W3C-checklink/', ''),
(44, 1, 'W3C [Validator]', 46, 'W3C_*Validator', ''),
(45, 1, 'WiseNut [Bot]', 47, 'http://www.WISEnutbot.com', ''),
(46, 1, 'YaCy [Bot]', 48, 'yacybot', ''),
(47, 1, 'Yahoo MMCrawler [Bot]', 49, 'Yahoo-MMCrawler/', ''),
(48, 1, 'Yahoo Slurp [Bot]', 50, 'Yahoo! DE Slurp', ''),
(49, 1, 'Yahoo [Bot]', 51, 'Yahoo! Slurp', ''),
(50, 1, 'YahooSeeker [Bot]', 52, 'YahooSeeker/', '');