| Line | Stmts. | Exclusive Time | Avg. |
| 1 | | | | # AWSTATS ROBOTS DATABASE |
| 2 | | | | #------------------------------------------------------- |
| 3 | | | | # If you want to add robots to extend AWStats database detection capabilities, |
| 4 | | | | # you must add an entry in RobotsSearchIDOrder_listx and RobotsHashIDLib. |
| 5 | | | | #------------------------------------------------------- |
| 6 | | | | # $Revision: 1.45 $ - $Author: eldy $ - $Date: 2007/04/02 18:30:53 $ |
| 7 | | | | |
| 8 | | | | # 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html |
| 9 | | | | # added dipsie (not tested with real data). |
| 10 | | | | # added DomainsDB.net http://domainsdb.net/ |
| 11 | | | | # added ia_archiver-web.archive.org (was inadvertently grouped with Alexa traffic) |
| 12 | | | | # added Nutch (used by looksmart (furl?)) |
| 13 | | | | # added rssImagesBot |
| 14 | | | | # added Sqworm |
| 15 | | | | # added t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e |
| 16 | | | | # added w3c css-validator |
| 17 | | | | # added documentation link to bot home pages for above and selected major bots. |
| 18 | | | | # In the case of international bots, choose .com page. |
| 19 | | | | # Included tool tip (html "title"). |
| 20 | | | | # To do: parameterize to match both AWStats language and tooltips settings. |
| 21 | | | | # To do: add html links for all bots based on current documentation in source |
| 22 | | | | # files referenced below. |
| 23 | | | | # changed '\wbot[\/\-]', to '\wbot[\/\-]' (removed comma) |
| 24 | | | | # made minor grammar corrections to notes below |
| 25 | | | | # 2005-08-24 added YahooSeeker-Testing |
| 26 | | | | # added w3c-checklink |
| 27 | | | | # updated url for ask.com |
| 28 | | | | # 2005-08-24 added Girafabot http://www.girafa.com/ |
| 29 | | | | # 2005-08-30 added PluckFeedCrawler http://www.pluck.com/ |
| 30 | | | | # added Gaisbot/3.0 (robot05@gais.cs.ccu.edu.tw; ) |
| 31 | | | | # added geniebot (wgao@genieknows.com) |
| 32 | | | | # added BecomeBot link http://www.become.com/site_owners.html |
| 33 | | | | # added topicblogs http://www.topicblogs.com/ |
| 34 | | | | # added Powermarks; seen used by referrer spam |
| 35 | | | | # added YahooSeeker |
| 36 | | | | # added NG/2. http://www.exabot.com/ |
| 37 | | | | # 2005-09-15 added link for Walhello appie |
| 38 | | | | # added bender focused_crawler |
| 39 | | | | # updated YahooSeeker description (blog crawler) |
| 40 | | | | # 2005-09-16 added link for http://linkchecker.sourceforge.net |
| 41 | | | | # added ConveraCrawler/0.9d ( http://www.authoritativeweb.com/crawl) |
| 42 | | | | # added Blogslive info@blogslive.com intelliseek.com |
| 43 | | | | # added BlogPulse (ISSpider-3.0) intelliseek.com |
| 44 | | | | # 2005-09-26 added Feedfetcher-Google (http://www.google.com/feedfetcher.html) |
| 45 | | | | # added EverbeeCrawler |
| 46 | | | | # added Yahoo-Blogs http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html |
| 47 | | | | # added link for Bloglines http://www.bloglines.com |
| 48 | | | | # 2005-10-19 fixed Feedfetcher-Google (http://www.google.com/feedfetcher.html) |
| 49 | | | | # added Blogshares Spiders (Synchronized V1.5.1) |
| 50 | | | | # added yacy |
| 51 | | | | # 2005-11-21 added Argus www.simpy.com |
| 52 | | | | # added BlogsSay :: RSS Search Crawler (http://www.blogssay.com/) |
| 53 | | | | # added MJ12bot http://majestic12.co.uk/bot.php |
| 54 | | | | # added OpenTaggerBot (http://www.opentagger.com/opentaggerbot.htm) |
| 55 | | | | # added OutfoxBot/0.3 (For internet experiments; outfox.agent@gmail.com) |
| 56 | | | | # added RufusBot Rufus Web Miner http://64.124.122.252.webaroo.com/feedback.html |
| 57 | | | | # added Seekbot (http://www.seekbot.net/bot.html) |
| 58 | | | | # added Yahoo-MMCrawler/3.x (mms-mmcrawler-support@yahoo-inc.com) |
| 59 | | | | # added link for BaiDuSpider |
| 60 | | | | # added link for Blogshares Spider |
| 61 | | | | # added link for StackRambler http://www.rambler.ru/doc/faq.shtml |
| 62 | | | | # added link for WISENutbot |
| 63 | | | | # added link for ZyBorg/1.0 (wn-14.zyborg@looksmart.net; http://www.WISEnutbot.com). Moved location to above wisenut to avoid classification as wisenut |
| 64 | | | | # 2005-12-15 |
| 65 | | | | # added FAST Enteprise Crawler/6 (www dot fastsearch dot com). Note spelling Enteprise not Enterprise. |
| 66 | | | | # added findlinks http://wortschatz.uni-leipzig.de/findlinks/ |
| 67 | | | | # added IBM Almaden Research Center WebFountain™ http://www.almaden.ibm.com/cs/crawler [hc3] |
| 68 | | | | # added INFOMINE/8.0 VLCrawler (http://infomine.ucr.edu/useragents) |
| 69 | | | | # added lmspider (lmspider@scansoft.com) http://www.nuance.com/ |
| 70 | | | | # added noxtrumbot http://www.noxtrum.com/ |
| 71 | | | | # added SandCrawler (Microsoft) |
| 72 | | | | # added SBIder http://www.sitesell.com/sbider.html |
| 73 | | | | # added SeznamBot http://fulltext.seznam.cz/ |
| 74 | | | | # added sohu-search http://corp.sohu.com/ (looked for //robots.txt not /robots.txt) |
| 75 | | | | # added the ruffle SemanticWeb crawler v0.5 - http://www.unreach.net |
| 76 | | | | # added WebVulnCrawl/1.0 libwww-perl/5.803 (looked for //robots.txt not /robots.txt) |
| 77 | | | | # added Yahoo! Japan keyoshid http://www.yahoo.co.jp/ |
| 78 | | | | # added Y!J http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html |
| 79 | | | | # added link for GigaBot |
| 80 | | | | # added link for MagpieRSS |
| 81 | | | | # added link for MSIECrawler |
| 82 | | | | # 2005-12-21 |
| 83 | | | | # added aipbot http://www.aipbot.com aipbot@aipbot.com [matthys70 users.sourceforge.net] |
| 84 | | | | # added Everest-Vulcan Inc./0.1 (R&D project; http://everest.vulcan.com/crawlerhelp) |
| 85 | | | | # added Fast-Search-Engine http://www.fast-search-engine.com/ [matthys70 users.sourceforge.net] |
| 86 | | | | # added g2Crawler (nobody@airmail.net) http://crawler.instantnetworks.net/ |
| 87 | | | | # added Jakarta commons-httpclient http://jakarta.apache.org/commons/httpclient/ (hit robots.txt). May be used as robot or browser - a site may want to remove this entry. |
| 88 | | | | # added OmniExplorer_Bot http://www.omni-explorer.com/ [matthys70 users.sourceforge.net] |
| 89 | | | | # added USTC-Semantic-Group ai.ustc.edu.cn/mas/en/research/index.php ? |
| 90 | | | | # 2005-12-22 |
| 91 | | | | # added EARTHCOM.info www.earthcom.info |
| 92 | | | | # added HTTrack off-line browser 'httrack','HTTrack', http://www.httrack.com/ [Moizes Gabor] |
| 93 | | | | # added KummHttp http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_g_l_301105_2\b [Moizes Gabor] |
| 94 | | | | # 2006-01-01 |
| 95 | | | | # added Dulance http://www.dulance.com/bot.jsp |
| 96 | | | | # added MojeekBot http://www.mojeek.com/bot.html |
| 97 | | | | # added nicebot http://www.egghelp.org/setup.htm ? |
| 98 | | | | # added Snappy http://www.urltrends.com/faq.php |
| 99 | | | | # added sohu agent |
| 100 | | | | # added TencentTraveler |
| 101 | | | | # added VORTEX http://marty.anstey.ca/robots/vortex/ [matthys70 users.sourceforge.net] |
| 102 | | | | # added zspider http://feedback.redkolibri.com/ |
| 103 | | | | # 2006-01-13 |
| 104 | | | | # added boitho.com-dc http://www.boitho.com/dcbot.html |
| 105 | | | | # added IRLbot http://irl.cs.tamu.edu/crawler |
| 106 | | | | # added virus_detector virus_harvester@securecomputing.com |
| 107 | | | | # added Wavefire http://www.wavefire.com; info@wavefire.com |
| 108 | | | | # added WebFilter Robot |
| 109 | | | | # 2006-01-24 |
| 110 | | | | # added Shim-Crawler http://www.logos.ic.i.u-tokyo.ac.jp/crawler/; crawl@logos.ic.i.u-tokyo.ac.jp |
| 111 | | | | # added Exabot exabot.com |
| 112 | | | | # added LetsCrawl.com http://letscrawl.com |
| 113 | | | | # added ichiro http://help.goo.ne.jp/door/crawlerE.html |
| 114 | | | | # 2006-01-27 additional 22 robots from a list provided by Moizes Gabor |
| 115 | | | | # added ALeadSoftbot http://www.aleadsoft.com/bot.htm |
| 116 | | | | # added CipinetBot http://www.cipinet.com/bot.html |
| 117 | | | | # added Cuasarbot http://www.cuasar.com/ |
| 118 | | | | # added Dumbot http://www.dumbfind.com/ |
| 119 | | | | # added Extreme_Picture_Finder http://www.exisoftware.com/ |
| 120 | | | | # added Fooky.com/ScorpionBot/ScoutOut http://www.fooky.com/scorpionbots |
| 121 | | | | # added IlTrovatore-Setaccio http://www.iltrovatore.it/aiuto/motore_di_ricerca.html bot@iltrovatore.it |
| 122 | | | | # added InsurancoBot http://www.fastspywareremoval.com/ |
| 123 | | | | # added InternetArchive http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org |
| 124 | | | | # added KazoomBot http://www.kazoom.ca/bot.html kazoombot@kazoom.ca |
| 125 | | | | # added Kurzor http://www.easymail.hu/ cursor@easymail.hu |
| 126 | | | | # added NutchCVS http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org |
| 127 | | | | # added NutchOSU-VLIB http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org |
| 128 | | | | # added Orbiter http://www.dailyorbit.com/bot.htm |
| 129 | | | | # added PHP_version_tracker http://www.nexen.net/phpversion/bot.php |
| 130 | | | | # added SuperBot http://www.sparkleware.com/superbot/ |
| 131 | | | | # added SynooBot http://www.synoo.de/bot.html webmaster@synoo.com |
| 132 | | | | # added TestBot http://www.agbrain.com/ |
| 133 | | | | # added TutorGigBot http://www.tutorgig.info/ |
| 134 | | | | # added UP.Browser http://developer.openwave.com/dvl/support/faqs/faq_mag_browser.htm |
| 135 | | | | # added WebIndexer mailto://webindexerv1@yahoo.com |
| 136 | | | | # added WebMiner http://64.124.122.252/feedback.html |
| 137 | | | | # 2006-02-01 |
| 138 | | | | # added heritrix https://sourceforge.net/forum/message.php?msg_id=3550202 |
| 139 | | | | # added Zeus Webster Pro https://sourceforge.net/forum/message.php?msg_id=3141164 |
| 140 | | | | # additional robots from a list provided by Moizes Gabor [ mojzi -a-t- free mail hu ] |
| 141 | | | | # added Candlelight_Favorites_Inspector |
| 142 | | | | # added DomainChecker |
| 143 | | | | # added EasyDL |
| 144 | | | | # added FavOrg |
| 145 | | | | # added Favorites_Sweeper |
| 146 | | | | # added Html_Link_Validator |
| 147 | | | | # added Internet_Ninja |
| 148 | | | | # added JRTwine_Software_Check_Favorites_Utility |
| 149 | | | | # fixed Microsoft_URL_Control |
| 150 | | | | # added miniRank |
| 151 | | | | # added Missigua_Locator |
| 152 | | | | # added NPBot |
| 153 | | | | # added Ocelli |
| 154 | | | | # added Onet.pl_SA |
| 155 | | | | # added proodleBot |
| 156 | | | | # added SearchGuild_DMOZ_Experiment |
| 157 | | | | # added Susie |
| 158 | | | | # added Website_Monitoring_Bot |
| 159 | | | | # added Xenu_Link_Sleuth |
| 160 | | | | # 2006-05-15 |
| 161 | | | | # added ASPseek http://www.aspseek.org/ |
| 162 | | | | # added AdamM Bot http://home.blic.net/adamm/ |
| 163 | | | | # added archive.org_bot http://crawls.archive.org/collections/bncf/crawl.html |
| 164 | | | | # added arianna.libero.it (Italian Portal/search engine) |
| 165 | | | | # added Biz360 spider http://www.biz360.com |
| 166 | | | | # added BlogBridge Service http://www.blogbridge.com/ |
| 167 | | | | # added BlogSearch http://www.icerocket.com/ |
| 168 | | | | # added libcrawl |
| 169 | | | | # added edgeio-retriever http://www.edgeio.com |
| 170 | | | | # added FeedFlow http://feedflow.com/about |
| 171 | | | | # added Biblioteca Nazionale Centrale di Firenze (Italian National Archive) http://www.bncf.firenze.sbn.it/raccolta.txt |
| 172 | | | | # added Java catchall - used by many spam bots |
| 173 | | | | # added lanshanbot http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_g_l_140406_1%5Cb |
| 174 | | | | # added msnbot-media http://search.msn.com/msnbot.htm |
| 175 | | | | # added MT::Telegraph::Agent |
| 176 | | | | # added Netluchs http://www.netluchs.de/ (German SE bot) |
| 177 | | | | # added oBot http://www.webmasterworld.com/forum11/1616.htm |
| 178 | | | | # added Onfolio http://www.onfolio.com/ (IE Toolbar plugin) - hit rss feeds. |
| 179 | | | | # added ping.blo.gs http://blo.gs/ping.php blog bot |
| 180 | | | | # added sogou spider http://corp.sohu.com/20051130/n240842344.shtml |
| 181 | | | | # added sogou test http://corp.sohu.com/20051130/n240842344.shtml |
| 182 | | | | # added Sphere Scout http://www.sphere.com/ |
| 183 | | | | # added sproose crawler http://www.sproose.com/bot.html |
| 184 | | | | # added SyndicAPI http://syndicapi.com/bot.html |
| 185 | | | | # added Yahoo! Mindset http://mindset.research.yahoo.com/ |
| 186 | | | | # added msrabot |
| 187 | | | | # added Vagabondo & Vagabondo-WAP http://www.wise-guys.nl/Contact/index.php?botselected=webagents&lang=uk |
| 188 | | | | # fixed Missigua Locator detection (Missigua_Locator -> Missigua Locator) |
| 189 | | | | # changed echo to echo! to avoid conflict with the bonecho (Firefox 2.0) browser. |
| 190 | | | | # This requires you to reprocess historic logs if you want EchO! to be recognized for older reports. |
| 191 | | | | # 2006-05-17 |
| 192 | | | | # added Alpha Search Agent # 62.152.125.60 Eurologon Srl |
| 193 | | | | # added Krugle http://www.krugle.com/crawler/info.html the search engine for developers |
| 194 | | | | # added Octora Beta Bot http://www.octora.com/ # Blog and Rss Search Engine |
| 195 | | | | # added UbiCrawler http://law.dsi.unimi.it/ubicrawler/ |
| 196 | | | | # added Yahoo! Slurp China http://misc.yahoo.com.cn/help.html |
| 197 | | | | # You must reprocess old logs for the Yahoo! Slurp China bot to be detected in old reports |
| 198 | | | | # 2006-05-20 |
| 199 | | | | # added 1-More Scanner http://www.myzips.com/software/1-More-Scanner.phtml |
| 200 | | | | # added Accoona-AI-Agent http://www.accoona.com/ |
| 201 | | | | # added ActiveBookmark http://www.libmaster.com/active_bookmark.php |
| 202 | | | | # added BIGLOTRON http://www.biglotron.com/robot.html |
| 203 | | | | # added Bookmark-Manager http://bkm.sourceforge.net/ |
| 204 | | | | # added cbn00glebot |
| 205 | | | | # added Cerberian Drtrs http://www.pgts.com.au/cgi-bin/psql?robot_info=25240 |
| 206 | | | | # added CFNetwork http://www.cocoadev.com/index.pl?CFNetwork |
| 207 | | | | # added CheckWeb link validator http://p.duby.free.fr/chkweb.htm |
| 208 | | | | # added Computer and Automation Research Institute Crawler http://www.ilab.sztaki.hu/~stamas/publications/p184-benczur.html |
| 209 | | | | # added ConveraCrawler http://www.authoritativeweb.com/crawl/ |
| 210 | | | | # added ConveraMultiMediaCrawler http://www.authoritativeweb.com/crawl/ |
| 211 | | | | # added CSE HTML Validator Lite Online http://online.htmlvalidator.com/php/onlinevallite.php |
| 212 | | | | # added Cursor http://adcenter.hu/docs/en/bot.html |
| 213 | | | | # added Custo http://www.netwu.com/custo/ |
| 214 | | | | # added DataFountains/DMOZ Downloader http://infomine.ucr.edu/ |
| 215 | | | | # added Deepindex http://www.deepindex.net/faq.php |
| 216 | | | | # added DNSGroup http://www.dnsgroup.com/ |
| 217 | | | | # added DoCoMo http://www.nttdocomo.co.jp/ |
| 218 | | | | # added dumm.de-Bot http://www.dumm.de/ |
| 219 | | | | # added ETS v http://www.freetranslation.com/help/ |
| 220 | | | | # added eventax http://www.eventax.de/ |
| 221 | | | | # added FAST Enterprise Crawler * crawleradmin.t-info@telekom.de http://www.telekom.de/ |
| 222 | | | | # added FAST Enterprise Crawler http://www.fast.no/ |
| 223 | | | | # added FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de http://www.telekom.de/ |
| 224 | | | | # added FeedValidator http://feedvalidator.org/ |
| 225 | | | | # added FilmkameraBot http://www.filmkamera.at/bot.html |
| 226 | | | | # added Findexa Crawler http://www.findexa.no/gulesider/article26548.ece |
| 227 | | | | # added Global Fetch http://www.wesonet.com/ |
| 228 | | | | # added GOFORITBOT http://www.goforit.com/about/ |
| 229 | | | | # added GoForIt.com http://www.goforit.com/about/ |
| 230 | | | | # added GPU p2p crawler http://gpu.sourceforge.net/search_engine.php |
| 231 | | | | # added HooWWWer http://cosco.hiit.fi/search/hoowwwer/ |
| 232 | | | | # added HPPrint |
| 233 | | | | # added HTMLParser http://htmlparser.sourceforge.net/ |
| 234 | | | | # added Hundesuche.com-Bot http://www.hundesuche.com/ |
| 235 | | | | # added InfoBot http://www.infobot.org/ |
| 236 | | | | # added InfociousBot http://corp.infocious.com/tech_crawler.php |
| 237 | | | | # added InternetSupervision http://internetsupervision.com/ |
| 238 | | | | # added isearch2006 http://www.yahoo.com.cn/ |
| 239 | | | | # added IUPUI_Research_Bot http://spamhuntress.com/2005/04/25/a-mail-harvester-visits/ |
| 240 | | | | # added KalamBot http://64.124.122.251/feedback.html |
| 241 | | | | # added kamano.de NewsFeedVerzeichnis http://www.kamano.de/ |
| 242 | | | | # added Kevin http://dznet.com/kevin/ |
| 243 | | | | # added KnowItAll http://www.cs.washington.edu/research/knowitall/ |
| 244 | | | | # added Knowledge.com http://www.knowledge.com/ |
| 245 | | | | # added Kouaa Krawler http://www.kouaa.com/ |
| 246 | | | | # added ksibot http://ego.ms.mff.cuni.cz/ |
| 247 | | | | # added Link Valet Online http://www.htmlhelp.com/tools/valet/ |
| 248 | | | | # added lwp-request http://search.cpan.org/~gaas/libwww-perl-5.69/bin/lwp-request |
| 249 | | | | # added lwp-trivial http://search.cpan.org/src/GAAS/libwww-perl-5.805/lib/LWP/Simple.pm |
| 250 | | | | # added MapoftheInternet.com http://MapoftheInternet.com/ |
| 251 | | | | # added Matrix S.p.A. - FAST Enterprise Crawler http://tin.virgilio.it/ |
| 252 | | | | # added Megite http://www.megite.com/ |
| 253 | | | | # added Metaspinner http://index.meta-spinner.de/ |
| 254 | | | | # added Mini-reptile |
| 255 | | | | # added Misterbot http://www.misterbot.fr/ |
| 256 | | | | # added Miva http://www.miva.com/ |
| 257 | | | | # added Mizzu Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b |
| 258 | | | | # added MSRBOT http://research.microsoft.com/research/sv/msrbot/ |
| 259 | | | | # added MS SharePoint Portal Server - MS Search 4.0 Robot http://support.microsoft.com/default.aspx?scid=kb;en-us;284022 |
| 260 | | | | # added Mydoyouhike http://www.doyouhike.net/my |
| 261 | | | | # added NASA Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_140506_2\b |
| 262 | | | | # added NetSprint http://www.netsprint.pl/serwis/ |
| 263 | | | | # added NimbleCrawler http://www.healthline.com/ |
| 264 | | | | # added OpenWebSpider http://www.openwebspider.org/ |
| 265 | | | | # added Oracle Ultra Search http://www.oracle.com/technology/products/ultrasearch/index.html |
| 266 | | | | # added OSSProxy http://www.marketscore.com/FAQ.Aspx |
| 267 | | | | # added passwordmaker.org http://passwordmaker.org/ |
| 268 | | | | # added PEAR HTTP Request class http://pear.php.net/ |
| 269 | | | | # added PEERbot http://www.peerbot.com/ |
| 270 | | | | # added PHP version tracker http://www.nexen.net/phpversion/bot.php |
| 271 | | | | # added PictureOfInternet http://malfunction.org/poi/ |
| 272 | | | | # added plinki http://www.plinki.com/ |
| 273 | | | | # added Port Huron Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1133\b |
| 274 | | | | # added PostFavorites http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b |
| 275 | | | | # added ProjectWF-java-test-crawler |
| 276 | | | | # added PyQuery http://sourceforge.net/projects/pyquery/ |
| 277 | | | | # added Schizozilla http://spamhuntress.com/2005/03/18/gizmo/ |
| 278 | | | | # added Scumbot |
| 279 | | | | # added Sensis Web Crawler http://www.sensis.com.au/ |
| 280 | | | | # added snap.com beta crawler http://www.snap.com/ |
| 281 | | | | # added Steeler http://www.tkl.iis.u-tokyo.ac.jp/~crawler/ |
| 282 | | | | # added STEROID Download http://faqs.org.ru/progr/pascal/delphi_internet2.htm |
| 283 | | | | # added Suchfin-Bot http://www.suchfin.de/ |
| 284 | | | | # added Sunrise http://www.sunrisexp.com/ |
| 285 | | | | # added Tagyu Agent http://www.tagyu.com/ |
| 286 | | | | # added Tcl http client package http://www.tcl.tk/man/tcl8.4/TclCmd/http.htm |
| 287 | | | | # added TeragramCrawlerSURF http://www.teragram.com/ |
| 288 | | | | # added Test Crawler http://netp.ath.cx/ |
| 289 | | | | # added UnChaos Bot Hybrid Web Search Engine http://www.unchaos.com/ |
| 290 | | | | # added unido-bot http://www.unchina.org/unido/unido/our_projects/3_3.html |
| 291 | | | | # added UniversalFeedParser http://feedparser.org/ (seen from md301000.inktomisearch.com) |
| 292 | | | | # added updated http://www.updated.com/ |
| 293 | | | | # added Vermut http://vermut.aol.com |
| 294 | | | | # added versus crawler from eda.baykan@epfl.ch http://www.epfl.ch/Eindex.html |
| 295 | | | | # added Vespa Crawler (Yahoo Norway?) http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_t_z_030406_1%5Cb |
| 296 | | | | # added VSE http://www.vivisimo.com/ |
| 297 | | | | # added webcrawl.net http://www.webcrawl.net/ |
| 298 | | | | # added Web Downloader http://www.krasu.ru/soft/chuchelo/ |
| 299 | | | | # added Webdup http://www.webdup.com/en/index.html |
| 300 | | | | # added Wells Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b |
| 301 | | | | # added WordPress http://wordpress.org/ |
| 302 | | | | # added wume crawler http://wume.cse.lehigh.edu/~xiq204/crawler/ |
| 303 | | | | # added Xenu's Link Sleuth (with ') |
| 304 | | | | # added xirq http://www.xirq.com/ |
| 305 | | | | # added yoogliFetchAgent http://www.yoogli.com/ |
| 306 | | | | # added Z-Add Link Checker http://w3.z-add.co.uk/linkcheck/ |
| 307 | | | | # -- fix - some robots were reported with _ where _ should have been a space. |
| 308 | | | | # changed Xenu Link Sleuth |
| 309 | | | | # changed microsoft\_url\_control -> microsoft\surl\scontrol |
| 310 | | | | # changed favorites\_sweeper -> favorites\ssweeper |
| 311 | | | | # -- updates |
| 312 | | | | # updated AskJeeves to Ask |
| 313 | | | | |
| 314 | | | | # to do MS Search 4.0 Robot |
| 315 | | | | |
| 316 | | | | #package AWSROB; |
| 317 | | | | |
| 318 | | | | |
| 319 | | | | # Robots list was found at http://www.robotstxt.org/wc/active/all.txt |
| 320 | | | | # Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html |
| 321 | | | | # Rem: To avoid bad detection, some robots' IDs were removed from this list: |
| 322 | | | | # - Robots with ID of 3 letters only |
| 323 | | | | # - Robots called 'webs' and 'tcl' |
| 324 | | | | # Rem: Some robots mostly used for downloading have also been removed, i.e. wget |
| 325 | | | | # Rem: directhit changed into direct_hit (its real id) |
| 326 | | | | # Rem: calif changed into calif[^r] to avoid confusion with the Tiscalifreenet browser |
| 327 | | | | # Rem: fish changed into [^a]fish to avoid confusion with the Madsafish browser |
| 328 | | | | # Rem: roadrunner changed into road_runner |
| 329 | | | | # Rem: lycos changed to lycos_ to avoid confusion with lycos-online browser |
| 330 | | | | # Rem: voyager changed into ^voyager\/ to avoid excluding the voyager and amigavoyager browsers |
| 331 | | | | |
| 332 | | | | # RobotsSearchIDOrder |
| 333 | | | | # It contains all matching criteria to search for in log fields. This list is |
| 334 | | | | # used to know in which order to search Robot IDs. |
| 335 | | | | # Most frequent ones are in list1, used when LevelForRobotsDetection is 1 or more |
| 336 | | | | # Minor robots are in list2, used when LevelForRobotsDetection is 2 or more |
| 337 | | | | # Note: Robots IDs are in lower case, ' ' and '+' are changed into '_' and are quoted. |
| 338 | | | | #------------------------------------------------------- |
| 339 | 1 | 2.1e-5 | 2.1e-5 | @RobotsSearchIDOrder_list1 = ( |
| 340 | | | | # Common robots (in robot file -- presumably the robotstxt.org list cited in the header; confirm) |
| 341 | | | | 'appie', |
| 342 | | | | 'architext', |
| 343 | | | | 'jeeves', |
| 344 | | | | 'bjaaland', |
| 345 | | | | 'ferret', |
| 346 | | | | 'googlebot', |
| 347 | | | | 'google-sitemaps', |
| 348 | | | | 'gulliver', |
| 349 | | | | 'virus\_detector', # Must be before harvest (changelog lists its contact as virus_harvester@securecomputing.com, which presumably would also match 'harvest') |
| 350 | | | | 'harvest', |
| 351 | | | | 'htdig', |
| 352 | | | | 'linkwalker', |
| 353 | | | | 'lycos_', |
| 354 | | | | 'moget', |
| 355 | | | | 'muscatferret', |
| 356 | | | | 'myweb', |
| 357 | | | | 'nomad', |
| 358 | | | | 'scooter', |
| 359 | | | | 'yahoo!\sslurp\schina', # Must come before singular slurp or yahoo, otherwise the more general ID would be found first |
| 360 | | | | 'slurp', |
| 361 | | | | '^voyager\/', |
| 362 | | | | 'weblayers', |
| 363 | | | | # Common robots (not in robot file) |
| 364 | | | | 'antibot', |
| 365 | | | | 'bruinbot', |
| 366 | | | | 'digout4u', |
| 367 | | | | 'echo!', |
| 368 | | | | 'fast\-webcrawler', |
| 369 | | | | 'ia_archiver\-web\.archive\.org', # Must be before ia_archiver to avoid confusion with alexa (the shorter ID is a prefix of this one) |
| 370 | | | | 'ia_archiver', |
| 371 | | | | 'jennybot', |
| 372 | | | | 'mercator', |
| 373 | | | | 'netcraft', |
| 374 | | | | 'msnbot\-media', |
| 375 | | | | 'msnbot', |
| 376 | | | | 'petersnews', |
| 377 | | | | 'relevantnoise\.com', |
| 378 | | | | 'unlost_web_crawler', |
| 379 | | | | 'voila', |
| 380 | | | | 'webbase', |
| 381 | | | | 'webcollage', |
| 382 | | | | 'cfetch', |
| 383 | | | | 'zyborg', # Must be before wisenutbot: per the changelog, the ZyBorg agent string includes http://www.WISEnutbot.com and would otherwise be classified as wisenut |
| 384 | | | | 'wisenutbot' |
| 385 | | | | ); |
| 386 | 1 | 0.00019 | 0.00019 | @RobotsSearchIDOrder_list2 = ( |
| 387 | | | | # Less common robots (In robot file) |
| 388 | | | | '[^a]fish', |
| 389 | | | | 'abcdatos', |
| 390 | | | | 'acme\.spider', |
| 391 | | | | 'ahoythehomepagefinder', |
| 392 | | | | 'alkaline', |
| 393 | | | | 'anthill', |
| 394 | | | | 'arachnophilia', |
| 395 | | | | 'arale', |
| 396 | | | | 'araneo', |
| 397 | | | | 'aretha', |
| 398 | | | | 'ariadne', |
| 399 | | | | 'powermarks', |
| 400 | | | | 'arks', |
| 401 | | | | 'aspider', |
| 402 | | | | 'atn\.txt', |
| 403 | | | | 'atomz', |
| 404 | | | | 'auresys', |
| 405 | | | | 'backrub', |
| 406 | | | | 'bbot', |
| 407 | | | | 'bigbrother', |
| 408 | | | | 'blackwidow', |
| 409 | | | | 'blindekuh', |
| 410 | | | | 'bloodhound', |
| 411 | | | | 'borg\-bot', |
| 412 | | | | 'brightnet', |
| 413 | | | | 'bspider', |
| 414 | | | | 'cactvschemistryspider', |
| 415 | | | | 'calif[^r]', |
| 416 | | | | 'cassandra', |
| 417 | | | | 'cgireader', |
| 418 | | | | 'checkbot', |
| 419 | | | | 'christcrawler', |
| 420 | | | | 'churl', |
| 421 | | | | 'cienciaficcion', |
| 422 | | | | 'collective', |
| 423 | | | | 'combine', |
| 424 | | | | 'conceptbot', |
| 425 | | | | 'coolbot', |
| 426 | | | | 'core', |
| 427 | | | | 'cosmos', |
| 428 | | | | 'cruiser', |
| 429 | | | | 'cusco', |
| 430 | | | | 'cyberspyder', |
| 431 | | | | 'desertrealm', |
| 432 | | | | 'deweb', |
| 433 | | | | 'dienstspider', |
| 434 | | | | 'digger', |
| 435 | | | | 'diibot', |
| 436 | | | | 'direct_hit', |
| 437 | | | | 'dnabot', |
| 438 | | | | 'download_express', |
| 439 | | | | 'dragonbot', |
| 440 | | | | 'dwcp', |
| 441 | | | | 'e\-collector', |
| 442 | | | | 'ebiness', |
| 443 | | | | 'elfinbot', |
| 444 | | | | 'emacs', |
| 445 | | | | 'emcspider', |
| 446 | | | | 'esther', |
| 447 | | | | 'evliyacelebi', |
| 448 | | | | 'fastcrawler', |
| 449 | | | | 'fdse', |
| 450 | | | | 'felix', |
| 451 | | | | 'fetchrover', |
| 452 | | | | 'fido', |
| 453 | | | | 'finnish', |
| 454 | | | | 'fireball', |
| 455 | | | | 'fouineur', |
| 456 | | | | 'francoroute', |
| 457 | | | | 'freecrawl', |
| 458 | | | | 'funnelweb', |
| 459 | | | | 'gama', |
| 460 | | | | 'gazz', |
| 461 | | | | 'gcreep', |
| 462 | | | | 'getbot', |
| 463 | | | | 'geturl', |
| 464 | | | | 'golem', |
| 465 | | | | 'grapnel', |
| 466 | | | | 'griffon', |
| 467 | | | | 'gromit', |
| 468 | | | | 'gulperbot', |
| 469 | | | | 'hambot', |
| 470 | | | | 'havindex', |
| 471 | | | | 'hometown', |
| 472 | | | | 'htmlgobble', |
| 473 | | | | 'hyperdecontextualizer', |
| 474 | | | | 'iajabot', |
| 475 | | | | 'iconoclast', |
| 476 | | | | 'ilse', |
| 477 | | | | 'imagelock', |
| 478 | | | | 'incywincy', |
| 479 | | | | 'informant', |
| 480 | | | | 'infoseek', |
| 481 | | | | 'infoseeksidewinder', |
| 482 | | | | 'infospider', |
| 483 | | | | 'inspectorwww', |
| 484 | | | | 'intelliagent', |
| 485 | | | | 'irobot', |
| 486 | | | | 'iron33', |
| 487 | | | | 'israelisearch', |
| 488 | | | | 'javabee', |
| 489 | | | | 'jbot', |
| 490 | | | | 'jcrawler', |
| 491 | | | | 'jobo', |
| 492 | | | | 'jobot', |
| 493 | | | | 'joebot', |
| 494 | | | | 'jubii', |
| 495 | | | | 'jumpstation', |
| 496 | | | | 'kapsi', |
| 497 | | | | 'katipo', |
| 498 | | | | 'kilroy', |
| 499 | | | | 'ko_yappo_robot', |
| 500 | | | | 'kummhttp', |
| 501 | | | | 'labelgrabber\.txt', |
| 502 | | | | 'larbin', |
| 503 | | | | 'legs', |
| 504 | | | | 'linkidator', |
| 505 | | | | 'linkscan', |
| 506 | | | | 'lockon', |
| 507 | | | | 'logo_gif', |
| 508 | | | | 'macworm', |
| 509 | | | | 'magpie', |
| 510 | | | | 'marvin', |
| 511 | | | | 'mattie', |
| 512 | | | | 'mediafox', |
| 513 | | | | 'merzscope', |
| 514 | | | | 'meshexplorer', |
| 515 | | | | 'mindcrawler', |
| 516 | | | | 'mnogosearch', |
| 517 | | | | 'momspider', |
| 518 | | | | 'monster', |
| 519 | | | | 'motor', |
| 520 | | | | 'muncher', |
| 521 | | | | 'mwdsearch', |
| 522 | | | | 'ndspider', |
| 523 | | | | 'nederland\.zoek', |
| 524 | | | | 'netcarta', |
| 525 | | | | 'netmechanic', |
| 526 | | | | 'netscoop', |
| 527 | | | | 'newscan\-online', |
| 528 | | | | 'nhse', |
| 529 | | | | 'northstar', |
| 530 | | | | 'nzexplorer', |
| 531 | | | | 'objectssearch', |
| 532 | | | | 'occam', |
| 533 | | | | 'octopus', |
| 534 | | | | 'openfind', |
| 535 | | | | 'orb_search', |
| 536 | | | | 'packrat', |
| 537 | | | | 'pageboy', |
| 538 | | | | 'parasite', |
| 539 | | | | 'patric', |
| 540 | | | | 'pegasus', |
| 541 | | | | 'perignator', |
| 542 | | | | 'perlcrawler', |
| 543 | | | | 'phantom', |
| 544 | | | | 'phpdig', |
| 545 | | | | 'piltdownman', |
| 546 | | | | 'pimptrain', |
| 547 | | | | 'pioneer', |
| 548 | | | | 'pitkow', |
| 549 | | | | 'pjspider', |
| 550 | | | | 'plumtreewebaccessor', |
| 551 | | | | 'poppi', |
| 552 | | | | 'portalb', |
| 553 | | | | 'psbot', |
| 554 | | | | 'python', |
| 555 | | | | 'raven', |
| 556 | | | | 'rbse', |
| 557 | | | | 'resumerobot', |
| 558 | | | | 'rhcs', |
| 559 | | | | 'road_runner', |
| 560 | | | | 'robbie', |
| 561 | | | | 'robi', |
| 562 | | | | 'robocrawl', |
| 563 | | | | 'robofox', |
| 564 | | | | 'robozilla', |
| 565 | | | | 'roverbot', |
| 566 | | | | 'rules', |
| 567 | | | | 'safetynetrobot', |
| 568 | | | | 'search\-info', |
| 569 | | | | 'search_au', |
| 570 | | | | 'searchprocess', |
| 571 | | | | 'senrigan', |
| 572 | | | | 'sgscout', |
| 573 | | | | 'shaggy', |
| 574 | | | | 'shaihulud', |
| 575 | | | | 'sift', |
| 576 | | | | 'simbot', |
| 577 | | | | 'site\-valet', |
| 578 | | | | 'sitetech', |
| 579 | | | | 'skymob', |
| 580 | | | | 'slcrawler', |
| 581 | | | | 'smartspider', |
| 582 | | | | 'snooper', |
| 583 | | | | 'solbot', |
| 584 | | | | 'speedy', |
| 585 | | | | 'spider_monkey', |
| 586 | | | | 'spiderbot', |
| 587 | | | | 'spiderline', |
| 588 | | | | 'spiderman', |
| 589 | | | | 'spiderview', |
| 590 | | | | 'spry', |
| 591 | | | | 'sqworm', |
| 592 | | | | 'ssearcher', |
| 593 | | | | 'suke', |
| 594 | | | | 'sunrise', |
| 595 | | | | 'suntek', |
| 596 | | | | 'sven', |
| 597 | | | | 'tach_bw', |
| 598 | | | | 'tagyu\sagent', |
| 599 | | | | 'tarantula', |
| 600 | | | | 'tarspider', |
| 601 | | | | 'techbot', |
| 602 | | | | 'templeton', |
| 603 | | | | 'titan', |
| 604 | | | | 'titin', |
| 605 | | | | 'tkwww', |
| 606 | | | | 'tlspider', |
| 607 | | | | 'ucsd', |
| 608 | | | | 'udmsearch', |
| 609 | | | | 'universalfeedparser', |
| 610 | | | | 'urlck', |
| 611 | | | | 'valkyrie', |
| 612 | | | | 'verticrawl', |
| 613 | | | | 'victoria', |
| 614 | | | | 'visionsearch', |
| 615 | | | | 'voidbot', |
| 616 | | | | 'vwbot', |
| 617 | | | | 'w3index', |
| 618 | | | | 'w3m2', |
| 619 | | | | 'wallpaper', |
| 620 | | | | 'wanderer', |
| 621 | | | | 'wapspider', |
| 622 | | | | 'webbandit', |
| 623 | | | | 'webcatcher', |
| 624 | | | | 'webcopy', |
| 625 | | | | 'webfetcher', |
| 626 | | | | 'webfoot', |
| 627 | | | | 'webinator', |
| 628 | | | | 'weblinker', |
| 629 | | | | 'webmirror', |
| 630 | | | | 'webmoose', |
| 631 | | | | 'webquest', |
| 632 | | | | 'webreader', |
| 633 | | | | 'webreaper', |
| 634 | | | | 'websnarf', |
| 635 | | | | 'webspider', |
| 636 | | | | 'webvac', |
| 637 | | | | 'webwalk', |
| 638 | | | | 'webwalker', |
| 639 | | | | 'webwatch', |
| 640 | | | | 'whatuseek', |
| 641 | | | | 'whowhere', |
| 642 | | | | 'wired\-digital', |
| 643 | | | | 'wmir', |
| 644 | | | | 'wolp', |
| 645 | | | | 'wombat', |
| 646 | | | | 'wordpress', |
| 647 | | | | 'worm', |
| 648 | | | | 'wwwc', |
| 649 | | | | 'wz101', |
| 650 | | | | 'xget', |
| 651 | | | | # Other robots reported by users |
| 652 | | | | '1\-more\sscanner', |
| 653 | | | | 'accoona\-ai\-agent', |
| 654 | | | | 'activebookmark', |
| 655 | | | | 'adamm\sbot', |
| 656 | | | | 'almaden', |
| 657 | | | | 'aipbot', |
| 658 | | | | 'aleadsoftbot', |
| 659 | | | | 'alpha\ssearch\sagent', |
| 660 | | | | 'aport', |
| 661 | | | | 'archive\.org_bot', |
| 662 | | | | 'argus', # Must be before nutch |
| 663 | | | | 'arianna\.libero\.it', |
| 664 | | | | 'aspseek', |
| 665 | | | | 'asterias', |
| 666 | | | | 'awbot', |
| 667 | | | | 'baiduspider', |
| 668 | | | | 'becomebot', |
| 669 | | | | 'bender', |
| 670 | | | | 'biglotron', |
| 671 | | | | 'bittorrent\sbot', |
| 672 | | | | 'biz360\sspider', |
| 673 | | | | 'blogbridge\sservice', |
| 674 | | | | 'bloglines', |
| 675 | | | | 'blogpulse',# added OpenWebSpider http://www.openwebspider.org/ |
| 676 | | | | # added NimbleCrawler http://www.healthline.com/ |
| 677 | | | | # added Mydoyouhike http://www.doyouhike.net/my |
| 678 | | | | # added PHP version tracker http://www.nexen.net/phpversion/bot.php |
| 679 | | | | # added kamano.de NewsFeedVerzeichnis http://www.kamano.de/ |
| 680 | | | | # added yoogliFetchAgent http://www.yoogli.com/ |
| 681 | | | | # added ETS v http://www.freetranslation.com/help/ |
| 682 | | | | # added HPPrint |
| 683 | | | | # added CFNetwork http://www.cocoadev.com/index.pl?CFNetwork |
| 684 | | | | # added STEROID Download http://faqs.org.ru/progr/pascal/delphi_internet2.htm |
| 685 | | | | # added OSSProxy http://www.marketscore.com/FAQ.Aspx |
| 686 | | | | # added Oracle Ultra Search http://www.oracle.com/technology/products/ultrasearch/index.html |
| 687 | | | | # added Web Downloader http://www.krasu.ru/soft/chuchelo/ |
| 688 | | | | # added lwp-request http://search.cpan.org/~gaas/libwww-perl-5.69/bin/lwp-request |
| 689 | | | | # added Sunrise http://www.sunrisexp.com/ |
| 690 | | | | # added WordPress http://wordpress.org/ |
| 691 | | | | # added Global Fetch http://www.wesonet.com/ |
| 692 | | | | # added NASA Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_140506_2\b |
| 693 | | | | # added NetSprint http://www.netsprint.pl/serwis/ |
| 694 | | | | # added Webdup http://www.webdup.com/en/index.html |
| 695 | | | | # added Megite http://www.megite.com/ |
| 696 | | | | # added Mini-reptile |
| 697 | | | | # added lwp-trivial http://search.cpan.org/src/GAAS/libwww-perl-5.805/lib/LWP/Simple.pm |
| 698 | | | | # added HTMLParser http://htmlparser.sourceforge.net/ |
| 699 | | | | |
| 700 | | | | 'blogsearch', |
| 701 | | | | 'blogshares', |
| 702 | | | | 'blogslive', |
| 703 | | | | 'blogssay', |
| 704 | | | | 'bncf\.firenze\.sbn\.it\/raccolta\.txt', |
| 705 | | | | 'bobby', |
| 706 | | | | 'boitho\.com\-dc', |
| 707 | | | | 'bookmark\-manager', |
| 708 | | | | 'boris', |
| 709 | | | | 'bumblebee', |
| 710 | | | | 'candlelight\_favorites\_inspector', |
| 711 | | | | 'cbn00glebot', |
| 712 | | | | 'cerberian\sdrtrs', |
| 713 | | | | 'cfnetwork', |
| 714 | | | | 'cipinetbot', |
| 715 | | | | 'checkweb\slink\svalidator', |
| 716 | | | | 'commons\-httpclient', |
| 717 | | | | 'computer\sand\sautomation\sresearch\sinstitute\scrawler', |
| 718 | | | | 'converamultimediacrawler', |
| 719 | | | | 'converacrawler', |
| 720 | | | | 'cscrawler', |
| 721 | | | | 'cse\shtml\svalidator\slite\sonline', |
| 722 | | | | 'cuasarbot', |
| 723 | | | | 'cursor', |
| 724 | | | | 'custo', |
| 725 | | | | 'datafountains/dmoz\sdownloader', |
| 726 | | | | 'daviesbot', |
| 727 | | | | 'daypopbot', |
| 728 | | | | 'deepindex', |
| 729 | | | | 'dipsie\.bot', |
| 730 | | | | 'dnsgroup', |
| 731 | | | | 'docomo', |
| 732 | | | | 'domainchecker', |
| 733 | | | | 'domainsdb\.net', |
| 734 | | | | 'dulance', |
| 735 | | | | 'dumbot', |
| 736 | | | | 'dumm\.de\-bot', |
| 737 | | | | 'earthcom\.info', |
| 738 | | | | 'easydl', |
| 739 | | | | 'edgeio\-retriever', |
| 740 | | | | 'ets\sv', |
| 741 | | | | 'exactseek', |
| 742 | | | | 'extreme\_picture\_finder', |
| 743 | | | | 'eventax', |
| 744 | | | | 'everbeecrawler', |
| 745 | | | | 'everest\-vulcan', |
| 746 | | | | 'ezresult', |
| 747 | | | | 'enteprise', |
| 748 | | | | 'fast\senterprise\scrawler.*crawleradmin\.t\-info@telekom\.de', |
| 749 | | | | 'fast\senterprise\scrawler.*t\-info_bi_cluster\scrawleradmin\.t\-info@telekom\.de', |
| 750 | | | | 'matrix\ss\.p\.a\.\s\-\sfast\senterprise\scrawler', # must come before fast enterprise crawler |
| 751 | | | | 'fast\senterprise\scrawler', |
| 752 | | | | 'fast\-search\-engine', |
| 753 | | | | 'favorg', |
| 754 | | | | 'favorites\ssweeper', |
| 755 | | | | 'feedburner', |
| 756 | | | | 'feedfetcher\-google', |
| 757 | | | | 'feedflow', |
| 758 | | | | 'feedster', |
| 759 | | | | 'feedvalidator', |
| 760 | | | | 'filmkamerabot', |
| 761 | | | | 'findlinks', |
| 762 | | | | 'findexa\scrawler', |
| 763 | | | | 'fooky\.com\/ScorpionBot', |
| 764 | | | | 'g2crawler', |
| 765 | | | | 'gaisbot', |
| 766 | | | | 'geniebot', |
| 767 | | | | 'gigabot', |
| 768 | | | | 'girafabot', |
| 769 | | | | 'global\sfetch', |
| 770 | | | | 'gnodspider', |
| 771 | | | | 'goforit\.com', |
| 772 | | | | 'goforitbot', |
| 773 | | | | 'grub', |
| 774 | | | | 'gpu\sp2p\scrawler', |
| 775 | | | | 'henrythemiragorobot', |
| 776 | | | | 'heritrix', |
| 777 | | | | 'holmes', |
| 778 | | | | 'hoowwwer', |
| 779 | | | | 'hpprint', |
| 780 | | | | 'htmlparser', |
| 781 | | | | 'html\_link\_validator', |
| 782 | | | | 'httrack', |
| 783 | | | | 'hundesuche\.com\-bot', |
| 784 | | | | 'ichiro', |
| 785 | | | | 'iltrovatore\-setaccio', |
| 786 | | | | 'infobot', |
| 787 | | | | 'infociousbot', |
| 788 | | | | 'infomine', |
| 789 | | | | 'insurancobot', |
| 790 | | | | 'internet\_ninja', |
| 791 | | | | 'internetarchive', |
| 792 | | | | 'internetseer', |
| 793 | | | | 'internetsupervision', |
| 794 | | | | 'irlbot', |
| 795 | | | | 'isearch2006', |
| 796 | | | | 'iupui_research_bot', |
| 797 | | | | 'jrtwine\_software\_check\_favorites\_utility', |
| 798 | | | | 'justview', |
| 799 | | | | 'kalambot', |
| 800 | | | | 'kamano\.de\snewsfeedverzeichnis', |
| 801 | | | | 'kazoombot', |
| 802 | | | | 'kevin', |
| 803 | | | | 'keyoshid', # Must come before Y!J |
| 804 | | | | 'kinjabot', |
| 805 | | | | 'kinja\-imagebot', |
| 806 | | | | 'knowitall', |
| 807 | | | | 'knowledge\.com', |
| 808 | | | | 'kouaa\skrawler', |
| 809 | | | | 'krugle', |
| 810 | | | | 'ksibot', |
| 811 | | | | 'kurzor', |
| 812 | | | | 'lanshanbot', |
| 813 | | | | 'letscrawl\.com', |
| 814 | | | | 'libcrawl', |
| 815 | | | | 'linkbot', |
| 816 | | | | 'link\svalet\sonline', |
| 817 | | | | 'metager\-linkchecker', # Must be before linkchecker |
| 818 | | | | 'linkchecker', |
| 819 | | | | 'livejournal\.com', |
| 820 | | | | 'lmspider', |
| 821 | | | | 'lwp\-request', |
| 822 | | | | 'lwp\-trivial', |
| 823 | | | | 'magpierss', |
| 824 | | | | 'mapoftheinternet\.com', |
| 825 | | | | 'mediapartners\-google', |
| 826 | | | | 'megite', |
| 827 | | | | 'metaspinner', |
| 828 | | | | 'microsoft\surl\scontrol', |
| 829 | | | | 'mini\-reptile', |
| 830 | | | | 'minirank', |
| 831 | | | | 'missigua\slocator', |
| 832 | | | | 'misterbot', |
| 833 | | | | 'miva', |
| 834 | | | | 'mizzu\slabs', |
| 835 | | | | 'mj12bot', |
| 836 | | | | 'mojeekbot', |
| 837 | | | | 'tencenttraveler', # Must be before msiecrawler |
| 838 | | | | 'msiecrawler', |
| 839 | | | | 'ms\ssearch\s4\.0\srobot', |
| 840 | | | | 'msrabot', |
| 841 | | | | 'msrbot', |
| 842 | | | | 'mt::telegraph::agent', |
| 843 | | | | 'nagios', |
| 844 | | | | 'nasa\ssearch', |
| 845 | | | | 'mydoyouhike', |
| 846 | | | | 'netluchs', |
| 847 | | | | 'netsprint', |
| 848 | | | | 'newsgatoronline', |
| 849 | | | | 'nicebot', |
| 850 | | | | 'nimblecrawler', |
| 851 | | | | 'noxtrumbot', |
| 852 | | | | 'npbot', |
| 853 | | | | 'nutchcvs', |
| 854 | | | | 'nutchosu\-vlib', |
| 855 | | | | 'nutch', # Must come after other nutch versions |
| 856 | | | | 'ocelli', |
| 857 | | | | 'octora\sbeta\sbot', |
| 858 | | | | 'omniexplorer\_bot', |
| 859 | | | | 'onet\.pl\_sa', |
| 860 | | | | 'onfolio', |
| 861 | | | | 'opentaggerbot', |
| 862 | | | | 'openwebspider', |
| 863 | | | | 'oracle\sultra\ssearch', |
| 864 | | | | 'orbiter', |
| 865 | | | | 'outfoxbot', |
| 866 | | | | 'passwordmaker\.org', |
| 867 | | | | 'pear\shttp\srequest\sclass', |
| 868 | | | | 'peerbot', |
| 869 | | | | 'perman', |
| 870 | | | | 'php\_version\_tracker', |
| 871 | | | | 'php\sversion\stracker', |
| 872 | | | | 'pictureofinternet', |
| 873 | | | | 'ping\.blo\.gs', |
| 874 | | | | 'plinki', |
| 875 | | | | 'pluckfeedcrawler', |
| 876 | | | | 'pompos', |
| 877 | | | | 'popdexter', |
| 878 | | | | 'port\shuron\slabs', |
| 879 | | | | 'postfavorites', |
| 880 | | | | 'projectwf\-java\-test\-crawler', |
| 881 | | | | 'proodlebot', |
| 882 | | | | 'pyquery', |
| 883 | | | | 'rambler', |
| 884 | | | | 'redalert', |
| 885 | | | | 'rojo', |
| 886 | | | | 'rssimagesbot', |
| 887 | | | | 'ruffle', |
| 888 | | | | 'rufusbot', |
| 889 | | | | 'sandcrawler', |
| 890 | | | | 'sbider', |
| 891 | | | | 'schizozilla', |
| 892 | | | | 'scumbot', |
| 893 | | | | 'searchguild\_dmoz\_experiment', |
| 894 | | | | 'seekbot', |
| 895 | | | | 'sensis\sweb\scrawler', |
| 896 | | | | 'seznambot', |
| 897 | | | | 'shim\-crawler', |
| 898 | | | | 'shoutcast', |
| 899 | | | | 'slysearch', |
| 900 | | | | 'snap\.com\sbeta\scrawler', |
| 901 | | | | 'sogou\sspider', |
| 902 | | | | 'sogou\stest', |
| 903 | | | | 'sohu\-search', |
| 904 | | | | 'sohu', # "sohu agent" |
| 905 | | | | 'snappy', |
| 906 | | | | 'sphere\sscout', |
| 907 | | | | 'sproose\scrawler', |
| 908 | | | | 'steeler', |
| 909 | | | | 'steroid\s\sdownload', |
| 910 | | | | 'suchfin\-bot', |
| 911 | | | | 'superbot', |
| 912 | | | | 'surveybot', |
| 913 | | | | 'susie', |
| 914 | | | | 'syndic8', |
| 915 | | | | 'syndicapi', |
| 916 | | | | 'synoobot', |
| 917 | | | | 'tcl\shttp\sclient\spackage', |
| 918 | | | | 'technoratibot', |
| 919 | | | | 'teragramcrawlersurf', |
| 920 | | | | 'test\scrawler', |
| 921 | | | | 'testbot', |
| 922 | | | | 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e', |
| 923 | | | | 'topicblogs', |
| 924 | | | | 'turnitinbot', |
| 925 | | | | 'turtlescanner', # Must be before turtle |
| 926 | | | | 'turtle', |
| 927 | | | | 'tutorgigbot', |
| 928 | | | | 'ubicrawler', |
| 929 | | | | 'ultraseek', |
| 930 | | | | 'unchaos\sbot\shybrid\sweb\ssearch\sengine', |
| 931 | | | | 'unido\-bot', |
| 932 | | | | 'up\.browser', |
| 933 | | | | 'updated', |
| 934 | | | | 'ustc\-semantic\-group', |
| 935 | | | | 'vagabondo\-wap', |
| 936 | | | | 'vagabondo', |
| 937 | | | | 'vermut', |
| 938 | | | | 'versus\scrawler\sfrom\seda\.baykan@epfl\.ch', |
| 939 | | | | 'vespa\scrawler', |
| 940 | | | | 'vortex', |
| 941 | | | | 'vse', |
| 942 | | | | 'w3c\-checklink', |
| 943 | | | | 'w3c\_css\_validator\_jfouffa', |
| 944 | | | | 'w3c_validator', |
| 945 | | | | 'wavefire', |
| 946 | | | | 'webclipping\.com', |
| 947 | | | | 'webcompass', |
| 948 | | | | 'webcrawl\.net', |
| 949 | | | | 'web\sdownloader', |
| 950 | | | | 'webdup', |
| 951 | | | | 'webfilter', |
| 952 | | | | 'webindexer', |
| 953 | | | | 'webminer', |
| 954 | | | | 'website\_monitoring\_bot', |
| 955 | | | | 'webvulncrawl', |
| 956 | | | | 'wells\ssearch', |
| 957 | | | | 'wonderer', |
| 958 | | | | 'wume\scrawler', |
| 959 | | | | 'wwweasel', |
| 960 | | | | 'xenu\'s\slink\ssleuth', |
| 961 | | | | 'xenu\slink\ssleuth', |
| 962 | | | | 'xirq', |
| 963 | | | | 'y!j', # Must come after keyoshid Y!J |
| 964 | | | | 'yacy', |
| 965 | | | | 'yahoo\-blogs', |
| 966 | | | | 'yahoo\-verticalcrawler', |
| 967 | | | | 'yahoofeedseeker', |
| 968 | | | | 'yahooseeker\-testing', |
| 969 | | | | 'yahooseeker', |
| 970 | | | | 'yahoo\-mmcrawler', |
| 971 | | | | 'yahoo!\smindset', |
| 972 | | | | 'yandex', |
| 973 | | | | 'yooglifetchagent', |
| 974 | | | | 'z\-add\slink\schecker', |
| 975 | | | | 'zealbot', |
| 976 | | | | 'zspider', |
| 977 | | | | 'zeus', |
| 978 | | | | 'ng\/1\.', # put at end to avoid false positive |
| 979 | | | | 'ng\/2\.', # put at end to avoid false positive |
| 980 | | | | 'exabot', # put at end to avoid false positive |
| 981 | | | | 'java' # put at end to avoid false positive |
| 982 | | | | ); |
| 983 | 1 | 3.0e-6 | 3.0e-6 | @RobotsSearchIDOrder_listgen = ( |
| 984 | | | | # Generic robot: catch-all patterns - the substrings 'robot', 'crawl', 'spider', or a word character followed by 'bot' and then '/' or '-' |
| 985 | | | | 'robot', |
| 986 | | | | 'crawl', |
| 987 | | | | 'spider', |
| 988 | | | | '\wbot[\/\-]' |
| 989 | | | | ); |
| 990 | | | | |
| 991 | | | | |
| 992 | | | | |
| 993 | | | | # RobotsHashIDLib |
| 994 | | | | # List of robots names ('robot id','robot clear text') |
| 995 | | | | #------------------------------------------------------- |
| 996 | 1 | 0.00102 | 0.00102 | %RobotsHashIDLib = ( |
| 997 | | | | # Common robots (In robot file) |
| 998 | | | | 'appie','<a href="http://www.walhello.com/" title="Bot home page [new window]" target="_blank">Walhello appie</a>', |
| 999 | | | | 'architext','ArchitextSpider', |
| 1000 | | | | 'jeeves','<a href="http://sp.ask.com/docs/about/tech_crawling.html" title="Bot home page [new window]" target="_blank">Ask</a>', |
| 1001 | | | | 'bjaaland','Bjaaland', |
| 1002 | | | | 'ferret','Wild Ferret Web Hopper #1, #2, #3', |
| 1003 | | | | 'googlebot','<a href="http://www.google.com/bot.html" title="Bot home page [new window]" target="_blank">Googlebot</a>', |
| 1004 | | | | 'google-sitemaps', 'Google Sitemaps', |
| 1005 | | | | 'gulliver','Northern Light Gulliver', |
| 1006 | | | | 'virus\_detector','<a href="http://www.securecomputing.com/" title="virus_harvester@securecomputing.com; Bot home page [new window]" target="_blank">virus_detector</a>', |
| 1007 | | | | 'harvest','Harvest', |
| 1008 | | | | 'htdig','ht://Dig', |
| 1009 | | | | 'linkwalker','LinkWalker', |
| 1010 | | | | 'lycos_','Lycos', |
| 1011 | | | | 'moget','moget', |
| 1012 | | | | 'muscatferret','Muscat Ferret', |
| 1013 | | | | 'myweb','Internet Shinchakubin', |
| 1014 | | | | 'nomad','Nomad', |
| 1015 | | | | 'scooter','Scooter', |
| 1016 | | | | 'yahoo!\sslurp\schina','<a href="http://misc.yahoo.com.cn/help.html" title="Bot home page [new window]" target="_blank">Yahoo! Slurp China</a>', |
| 1017 | | | | 'slurp','<a href="http://help.yahoo.com/help/us/ysearch/slurp/" title="Bot home page [new window]" target="_blank">Yahoo Slurp</a>', |
| 1018 | | | | '^voyager\/','Voyager', |
| 1019 | | | | 'weblayers','Weblayers', |
| 1020 | | | | # Common robots (Not in robot file) |
| 1021 | | | | 'antibot','Antibot', |
| 1022 | | | | 'bruinbot','<a href="http://web.archive.org/" title="BruinBot home page [new window]" target="_blank">The web archive</a>', |
| 1023 | | | | 'digout4u','Digout4u', |
| 1024 | | | | 'echo!','EchO!', |
| 1025 | | | | 'fast\-webcrawler','Fast-Webcrawler', |
| 1026 | | | | 'ia_archiver\-web\.archive\.org','<a href="http://web.archive.org/" title="Bot home page [new window]" target="_blank">The web archive (IA Archiver)</a>', |
| 1027 | | | | 'ia_archiver','<a href="http://www.alexa.com/" title="Bot home page [new window]" target="_blank">Alexa (IA Archiver)</a>', |
| 1028 | | | | 'jennybot','JennyBot', |
| 1029 | | | | 'mercator','Mercator', |
| 1030 | | | | 'msnbot\-media','<a href="http://search.msn.com/msnbot.htm" title="Bot home page [new window]" target="_blank">MSNBot-media</a>', |
| 1031 | | | | 'msnbot','<a href="http://search.msn.com/msnbot.htm" title="Bot home page [new window]" target="_blank">MSNBot</a>', |
| 1032 | | | | 'netcraft','<a href="http://www.netcraft.com/survey/" title="Bot home page [new window]" target="_blank">Netcraft</a>', |
| 1033 | | | | 'petersnews','Petersnews', |
| 1034 | | | | 'unlost_web_crawler','Unlost Web Crawler', |
| 1035 | | | | 'voila','Voila', |
| 1036 | | | | 'webbase', 'WebBase', |
| 1037 | | | | 'zyborg','<a href="http://www.WISEnutbot.com/" title="wn-14.zyborg@looksmart.net Bot home page [new window]" target="_blank">ZyBorg</a>', |
| 1038 | | | | 'wisenutbot','<a href="http://www.WISEnutbot.com/" title="Bot home page [new window]" target="_blank">WISENutbot</a>', |
| 1039 | | | | 'webcollage','<a href="http://www.jwz.org/webcollage/" title="WebCollage home page [new window]" target="_blank">WebCollage</a>', |
| 1040 | | | | 'cfetch','<a href="http://www.kosmix.com/crawler.html" title="kosmix home page [new window]" target="_blank">Cfetch</a>', |
| 1041 | | | | # Less common robots (In robot file) |
| 1042 | | | | '[^a]fish','Fish search', |
| 1043 | | | | 'abcdatos','ABCdatos BotLink', |
| 1044 | | | | 'acme\.spider','Acme.Spider', |
| 1045 | | | | 'ahoythehomepagefinder','Ahoy! The Homepage Finder', |
| 1046 | | | | 'alkaline','Alkaline', |
| 1047 | | | | 'anthill','Anthill', |
| 1048 | | | | 'arachnophilia','Arachnophilia', |
| 1049 | | | | 'arale','Arale', |
| 1050 | | | | 'araneo','Araneo', |
| 1051 | | | | 'aretha','Aretha', |
| 1052 | | | | 'ariadne','ARIADNE', |
| 1053 | | | | 'powermarks','<a href="http://www.kaylon.com/power.html" title="Bot home page [new window]" target="_blank">Powermarks</a>', # must come before Arks; seen used by referrer spam |
| 1054 | | | | 'arks','arks', |
| 1055 | | | | 'aspider','ASpider (Associative Spider)', |
| 1056 | | | | 'atn\.txt','ATN Worldwide', |
| 1057 | | | | 'atomz','Atomz.com Search Robot', |
| 1058 | | | | 'auresys','AURESYS', |
| 1059 | | | | 'backrub','BackRub', |
| 1060 | | | | 'bbot','BBot', |
| 1061 | | | | 'bigbrother','Big Brother', |
| 1062 | | | | 'blackwidow','BlackWidow', |
| 1063 | | | | 'blindekuh','Die Blinde Kuh', |
| 1064 | | | | 'bloodhound','Bloodhound', |
| 1065 | | | | 'borg\-bot','Borg-Bot', |
| 1066 | | | | 'brightnet','bright.net caching robot', |
| 1067 | | | | 'bspider','BSpider', |
| 1068 | | | | 'cactvschemistryspider','CACTVS Chemistry Spider', |
| 1069 | | | | 'calif[^r]','Calif', |
| 1070 | | | | 'cassandra','Cassandra', |
| 1071 | | | | 'cgireader','Digimarc Marcspider/CGI', |
| 1072 | | | | 'checkbot','Checkbot', |
| 1073 | | | | 'christcrawler','ChristCrawler.com', |
| 1074 | | | | 'churl','churl', |
| 1075 | | | | 'cienciaficcion','cIeNcIaFiCcIoN.nEt', |
| 1076 | | | | 'collective','Collective', |
| 1077 | | | | 'combine','Combine System', |
| 1078 | | | | 'conceptbot','Conceptbot', |
| 1079 | | | | 'coolbot','CoolBot', |
| 1080 | | | | 'core','Web Core / Roots', |
| 1081 | | | | 'cosmos','XYLEME Robot', |
| 1082 | | | | 'cruiser','Internet Cruiser Robot', |
| 1083 | | | | 'cusco','Cusco', |
| 1084 | | | | 'cyberspyder','CyberSpyder Link Test', |
| 1085 | | | | 'desertrealm','Desert Realm Spider', |
| 1086 | | | | 'deweb','DeWeb(c) Katalog/Index', |
| 1087 | | | | 'dienstspider','DienstSpider', |
| 1088 | | | | 'digger','Digger', |
| 1089 | | | | 'diibot','Digital Integrity Robot', |
| 1090 | | | | 'direct_hit','Direct Hit Grabber', |
| 1091 | | | | 'dnabot','DNAbot', |
| 1092 | | | | 'download_express','DownLoad Express', |
| 1093 | | | | 'dragonbot','DragonBot', |
| 1094 | | | | 'dwcp','DWCP (Dridus\' Web Cataloging Project)', |
| 1095 | | | | 'e\-collector','e-collector', |
| 1096 | | | | 'ebiness','EbiNess', |
| 1097 | | | | 'elfinbot','ELFINBOT', |
| 1098 | | | | 'emacs','Emacs-w3 Search Engine', |
| 1099 | | | | 'emcspider','ananzi', |
| 1100 | | | | 'esther','Esther', |
| 1101 | | | | 'evliyacelebi','Evliya Celebi', |
| 1102 | | | | 'fastcrawler','FastCrawler', |
| 1103 | | | | 'fdse','Fluid Dynamics Search Engine robot', |
| 1104 | | | | 'felix','Felix IDE', |
| 1105 | | | | 'fetchrover','FetchRover', |
| 1106 | | | | 'fido','fido', |
| 1107 | | | | 'finnish','Hämähäkki', |
| 1108 | | | | 'fireball','KIT-Fireball', |
| 1109 | | | | 'fouineur','Fouineur', |
| 1110 | | | | 'francoroute','Robot Francoroute', |
| 1111 | | | | 'freecrawl','Freecrawl', |
| 1112 | | | | 'funnelweb','FunnelWeb', |
| 1113 | | | | 'gama','gammaSpider, FocusedCrawler', |
| 1114 | | | | 'gazz','gazz', |
| 1115 | | | | 'gcreep','GCreep', |
| 1116 | | | | 'getbot','GetBot', |
| 1117 | | | | 'geturl','GetURL', |
| 1118 | | | | 'golem','Golem', |
| 1119 | | | | 'grapnel','Grapnel/0.01 Experiment', |
| 1120 | | | | 'griffon','Griffon', |
| 1121 | | | | 'gromit','Gromit', |
| 1122 | | | | 'gulperbot','Gulper Bot', |
| 1123 | | | | 'hambot','HamBot', |
| 1124 | | | | 'havindex','havIndex', |
| 1125 | | | | 'hometown','Hometown Spider Pro', |
| 1126 | | | | 'htmlgobble','HTMLgobble', |
| 1127 | | | | 'hyperdecontextualizer','Hyper-Decontextualizer', |
| 1128 | | | | 'iajabot','iajaBot', |
| 1129 | | | | 'iconoclast','Popular Iconoclast', |
| 1130 | | | | 'ilse','Ingrid', |
| 1131 | | | | 'imagelock','Imagelock', |
| 1132 | | | | 'incywincy','IncyWincy', |
| 1133 | | | | 'informant','Informant', |
| 1134 | | | | 'infoseek','InfoSeek Robot 1.0', |
| 1135 | | | | 'infoseeksidewinder','Infoseek Sidewinder', |
| 1136 | | | | 'infospider','InfoSpiders', |
| 1137 | | | | 'inspectorwww','Inspector Web', |
| 1138 | | | | 'intelliagent','IntelliAgent', |
| 1139 | | | | 'irobot','I, Robot', |
| 1140 | | | | 'iron33','Iron33', |
| 1141 | | | | 'israelisearch','Israeli-search', |
| 1142 | | | | 'javabee','JavaBee', |
| 1143 | | | | 'jbot','JBot Java Web Robot', |
| 1144 | | | | 'jcrawler','JCrawler', |
| 1145 | | | | 'jobo','JoBo Java Web Robot', |
| 1146 | | | | 'jobot','Jobot', |
| 1147 | | | | 'joebot','JoeBot', |
| 1148 | | | | 'jubii','The Jubii Indexing Robot', |
| 1149 | | | | 'jumpstation','JumpStation', |
| 1150 | | | | 'kapsi','image.kapsi.net', |
| 1151 | | | | 'katipo','Katipo', |
| 1152 | | | | 'kilroy','Kilroy', |
| 1153 | | | | 'ko_yappo_robot','KO_Yappo_Robot', |
| 1154 | | | | 'kummhttp','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_g_l_301105_2\b" title="Bot documentation page [new window]" target="_blank">KummHttp</a>', |
| 1155 | | | | 'labelgrabber\.txt','LabelGrabber', |
| 1156 | | | | 'larbin','<a href="http://para.inria.fr/~ailleret/larbin/index-eng.html" title="Bot home page [new window]" target="_blank">larbin</a>', |
| 1157 | | | | 'legs','legs', |
| 1158 | | | | 'linkidator','Link Validator', |
| 1159 | | | | 'linkscan','LinkScan', |
| 1160 | | | | 'lockon','Lockon', |
| 1161 | | | | 'logo_gif','logo.gif Crawler', |
| 1162 | | | | 'macworm','Mac WWWWorm', |
| 1163 | | | | 'lmspider','<a href="http://www.nuance.com/" title="Bot home page lmspider@scansoft.com [new window]" target="_blank">lmspider</a>', |
| 1164 | | | | 'lwp\-request','<a href="http://search.cpan.org/~gaas/libwww-perl-5.69/bin/lwp-request" title="lwp-request home page [new window]" target="_blank">lwp-request</a>', |
| 1165 | | | | 'lwp\-trivial','<a href="http://search.cpan.org/src/GAAS/libwww-perl-5.805/lib/LWP/Simple.pm" title="lwp-trivial home page [new window]" target="_blank">lwp-trivial</a>', |
| 1166 | | | | 'magpie','<a href="http://magpierss.sf.net/" title="Bot home page [new window]" target="_blank">MagpieRSS</a>', |
| 1167 | | | | 'marvin','marvin/infoseek', |
| 1168 | | | | 'mattie','Mattie', |
| 1169 | | | | 'mediafox','MediaFox', |
| 1170 | | | | 'merzscope','MerzScope', |
| 1171 | | | | 'meshexplorer','NEC-MeshExplorer', |
| 1172 | | | | 'mindcrawler','MindCrawler', |
| 1173 | | | | 'mnogosearch','mnoGoSearch search engine software', |
| 1174 | | | | 'momspider','MOMspider', |
| 1175 | | | | 'monster','Monster', |
| 1176 | | | | 'motor','Motor', |
| 1177 | | | | 'muncher','Muncher', |
| 1178 | | | | 'mwdsearch','Mwd.Search', |
| 1179 | | | | 'ndspider','NDSpider', |
| 1180 | | | | 'nederland\.zoek','Nederland.zoek', |
| 1181 | | | | 'netcarta','NetCarta WebMap Engine', |
| 1182 | | | | 'netmechanic','<a href="http://www.netmechanic.com/" title="Bot home page [new window]" target="_blank">NetMechanic</a>', |
| 1183 | | | | 'netscoop','NetScoop', |
| 1184 | | | | 'newscan\-online','newscan-online', |
| 1185 | | | | 'nhse','NHSE Web Forager', |
| 1186 | | | | 'northstar','The NorthStar Robot', |
| 1187 | | | | 'nzexplorer','nzexplorer', |
| 1188 | | | | 'objectssearch','ObjectsSearch', |
| 1189 | | | | 'occam','Occam', |
| 1190 | | | | 'octopus','HKU WWW Octopus', |
| 1191 | | | | 'openfind','Openfind data gatherer', |
| 1192 | | | | 'orb_search','Orb Search', |
| 1193 | | | | 'packrat','Pack Rat', |
| 1194 | | | | 'pageboy','PageBoy', |
| 1195 | | | | 'parasite','ParaSite', |
| 1196 | | | | 'patric','Patric', |
| 1197 | | | | 'pegasus','pegasus', |
| 1198 | | | | 'perignator','The Peregrinator', |
| 1199 | | | | 'perlcrawler','PerlCrawler 1.0', |
| 1200 | | | | 'phantom','Phantom', |
| 1201 | | | | 'phpdig','PhpDig', |
| 1202 | | | | 'piltdownman','PiltdownMan', |
| 1203 | | | | 'pimptrain','Pimptrain.com\'s robot', |
| 1204 | | | | 'pioneer','Pioneer', |
| 1205 | | | | 'pitkow','html_analyzer', |
| 1206 | | | | 'pjspider','Portal Juice Spider', |
| 1207 | | | | 'plumtreewebaccessor','PlumtreeWebAccessor', |
| 1208 | | | | 'poppi','Poppi', |
| 1209 | | | | 'portalb','PortalB Spider', |
| 1210 | | | | 'psbot','<a href="http://www.picsearch.com/bot.html" title="Bot home page. [new window]" target="_blank">psbot</a>', |
| 1211 | | | | 'python','<a href="http://www.lib.uchicago.edu/keith/courses/python/class/7/" title="Bot home page. Used by many. [new window]" target="_blank">Python-urllib</a>', |
| 1212 | | | | 'raven','Raven Search', |
| 1213 | | | | 'rbse','RBSE Spider', |
| 1214 | | | | 'resumerobot','Resume Robot', |
| 1215 | | | | 'rhcs','RoadHouse Crawling System', |
| 1216 | | | | 'road_runner','Road Runner: The ImageScape Robot', |
| 1217 | | | | 'robbie','Robbie the Robot', |
| 1218 | | | | 'robi','ComputingSite Robi/1.0', |
| 1219 | | | | 'robocrawl','RoboCrawl Spider', |
| 1220 | | | | 'robofox','RoboFox', |
| 1221 | | | | 'robozilla','Robozilla', |
| 1222 | | | | 'roverbot','Roverbot', |
| 1223 | | | | 'rules','RuLeS', |
| 1224 | | | | 'safetynetrobot','SafetyNet Robot', |
| 1225 | | | | 'search\-info','Sleek', |
| 1226 | | | | 'search_au','Search.Aus-AU.COM', |
| 1227 | | | | 'searchprocess','SearchProcess', |
| 1228 | | | | 'senrigan','Senrigan', |
| 1229 | | | | 'sgscout','SG-Scout', |
| 1230 | | | | 'shaggy','ShagSeeker', |
| 1231 | | | | 'shaihulud','Shai\'Hulud', |
| 1232 | | | | 'sift','Sift', |
| 1233 | | | | 'simbot','Simmany Robot Ver1.0', |
| 1234 | | | | 'site\-valet','Site Valet', |
| 1235 | | | | 'sitetech','SiteTech-Rover', |
| 1236 | | | | 'skymob','Skymob.com', |
| 1237 | | | | 'slcrawler','SLCrawler', |
| 1238 | | | | 'smartspider','Smart Spider', |
| 1239 | | | | 'snooper','Snooper', |
| 1240 | | | | 'solbot','Solbot', |
| 1241 | | | | 'speedy','Speedy Spider', |
| 1242 | | | | 'spider_monkey','spider_monkey', |
| 1243 | | | | 'spiderbot','SpiderBot', |
| 1244 | | | | 'spiderline','Spiderline Crawler', |
| 1245 | | | | 'spiderman','SpiderMan', |
| 1246 | | | | 'spiderview','SpiderView(tm)', |
| 1247 | | | | 'spry','Spry Wizard Robot', |
| 1248 | | | | 'ssearcher','Site Searcher', |
| 1249 | | | | 'sqworm','<a href="http://www.websense.com/" title="Bot home page (source: http://www.pgts.com.au/) [new window]" target="_blank">Sqworm</a>', |
| 1250 | | | | 'suke','Suke', |
| 1251 | | | | 'sunrise','<a href="http://www.sunrisexp.com/" title="Sunrise home page [new window]" target="_blank">Sunrise</a>', |
| 1252 | | | | 'suntek','suntek search engine', |
| 1253 | | | | 'sven','Sven', |
| 1254 | | | | 'tach_bw','TACH Black Widow', |
| 1255 | | | | 'tagyu\sagent','<a href="http://www.tagyu.com/" title="Bot home page [new window]" target="_blank">Tagyu Agent</a>', |
| 1256 | | | | 'tarantula','Tarantula', |
| 1257 | | | | 'tarspider','tarspider', |
| 1258 | | | | 'techbot','TechBOT', |
| 1259 | | | | 'templeton','Templeton', |
| 1260 | | | | 'titan','TITAN', |
| 1261 | | | | 'titin','TitIn', |
| 1262 | | | | 'tkwww','The TkWWW Robot', |
| 1263 | | | | 'tlspider','TLSpider', |
| 1264 | | | | 'ucsd','UCSD Crawl', |
| 1265 | | | | 'udmsearch','UdmSearch', |
| 1266 | | | | 'universalfeedparser','<a href="http://feedparser.org/" title="Bot home page [new window]" target="_blank">UniversalFeedParser</a>', |
| 1267 | | | | 'urlck','URL Check', |
| 1268 | | | | 'valkyrie','Valkyrie', |
| 1269 | | | | 'verticrawl','Verticrawl', |
| 1270 | | | | 'victoria','Victoria', |
| 1271 | | | | 'visionsearch','vision-search', |
| 1272 | | | | 'voidbot','void-bot', |
| 1273 | | | | 'vwbot','VWbot', |
| 1274 | | | | 'w3index','The NWI Robot', |
| 1275 | | | | 'w3m2','W3M2', |
| 1276 | | | | 'wallpaper','WallPaper (alias crawlpaper)', |
| 1277 | | | | 'wanderer','the World Wide Web Wanderer', |
| 1278 | | | | 'wapspider','w@pSpider by wap4.com', |
| 1279 | | | | 'webbandit','WebBandit Web Spider', |
| 1280 | | | | 'webcatcher','WebCatcher', |
| 1281 | | | | 'webcopy','WebCopy', |
| 1282 | | | | 'webfetcher','webfetcher', |
| 1283 | | | | 'webfoot','The Webfoot Robot', |
| 1284 | | | | 'webinator','Webinator', |
| 1285 | | | | 'weblinker','WebLinker', |
| 1286 | | | | 'webmirror','WebMirror', |
| 1287 | | | | 'webmoose','The Web Moose', |
| 1288 | | | | 'webquest','WebQuest', |
| 1289 | | | | 'webreader','Digimarc MarcSpider', |
| 1290 | | | | 'webreaper','WebReaper', |
| 1291 | | | | 'websnarf','Websnarf', |
| 1292 | | | | 'webspider','WebSpider', |
| 1293 | | | | 'webvac','WebVac', |
| 1294 | | | | 'webwalk','webwalk', |
| 1295 | | | | 'webwalker','WebWalker', |
| 1296 | | | | 'webwatch','WebWatch', |
| 1297 | | | | 'whatuseek','whatUseek Winona', |
| 1298 | | | | 'whowhere','WhoWhere Robot', |
| 1299 | | | | 'wired\-digital','Wired Digital', |
| 1300 | | | | 'wmir','w3mir', |
| 1301 | | | | 'wolp','WebStolperer', |
| 1302 | | | | 'wombat','The Web Wombat', |
| 1303 | | | | 'wordpress','<a href="http://wordpress.org/" title="WordPress home page [new window]" target="_blank">WordPress</a>', |
| 1304 | | | | 'worm','The World Wide Web Worm', |
| 1305 | | | | 'wwwc','WWWC Ver 0.2.5', |
| 1306 | | | | 'wz101','WebZinger', |
| 1307 | | | | 'xget','XGET', |
| 1308 | | | | # Other robots reported by users |
| 1309 | | | | '1\-more\sscanner','<a href="http://www.myzips.com/software/1-More-Scanner.phtml" title="1-More Scanner home page [new window]" target="_blank">1-More Scanner</a>', |
| 1310 | | | | 'accoona\-ai\-agent','<a href="http://www.accoona.com/" title="Accoona-AI-Agent home page [new window]" target="_blank">Accoona-AI-Agent</a>', |
| 1311 | | | | 'activebookmark','<a href="http://www.libmaster.com/active_bookmark.php" title="ActiveBookmark home page [new window]" target="_blank">ActiveBookmark</a>', |
| 1312 | | | | 'adamm\sbot','<a href="http://home.blic.net/adamm/" title="Bot home page [new window]" target="_blank">AdamM Bot</a>', |
| 1313 | | | | 'almaden','<a href="http://www.almaden.ibm.com/cs/crawler" title="IBM Almaden Research Center WebFountain™ Bot home page [new window]" target="_blank">IBM Almaden</a> Research Center WebFountain™', |
| 1314 | | | | 'aipbot','<a href="http://www.aipbot.com/" title="aipbot@aipbot.com Bot home page [new window]" target="_blank">aipbot</a>', |
| 1315 | | | | 'aleadsoftbot','<a href="http://www.aleadsoft.com/bot.htm" title="ALeadSoftbot home page [new window]" target="_blank">ALeadSoftbot</a>', |
| 1316 | | | | 'alpha\ssearch\sagent','Alpha Search Agent', |
| 1317 | | | | 'aport', 'Aport', |
| 1318 | | | | 'archive\.org_bot','<a href="http://crawls.archive.org/collections/bncf/crawl.html" title="Bot home page [new window]" target="_blank">archive.org bot</a>', |
| 1319 | | | | 'argus','<a href="http://www.simpy.com/bot.html" title="feedback@simpy.com Bot home page [new window]" target="_blank">Argus</a>', |
| 1320 | | | | 'arianna\.libero\.it','<a href="http://arianna.libero.it/" title="Bot home page [new window]" target="_blank">arianna.libero.it</a>', |
| 1321 | | | | 'aspseek','<a href="http://www.aspseek.org/" title="Bot home page [new window]" target="_blank">ASPseek</a>', |
| 1322 | | | | 'asterias', 'Asterias', |
| 1323 | | | | 'awbot', 'AWBot', |
| 1324 | | | | 'baiduspider','<a href="http://www.baidu.com/search/spider.html" title="Bot home page [new window]" target="_blank">BaiDuSpider</a>', |
| 1325 | | | | 'becomebot', '<a href="http://www.become.com/site_owners.html" title="Bot home page [new window]" target="_blank">BecomeBot</a>', |
| 1326 | | | | 'bender','<a href="http://bender.ucr.edu/" title="Bot home page [new window]" target="_blank">bender</a> <a href="http://ivia.ucr.edu/manuals/NiFC/current/index.shtml" title="Bot home page [new window]" target="_blank">focused_crawler</a>', |
| 1327 | | | | 'biglotron','<a href="http://www.biglotron.com/robot.html" title="Bot home page [new window]" target="_blank">Biglotron</a>', |
| 1328 | | | | 'bittorrent\sbot','<a href="http://www.bittorrent.com/" title="Bot home page [new window]" target="_blank">BitTorrent Bot</a>', |
| 1329 | | | | 'biz360\sspider','<a href="http://www.biz360.com/" title="blogsmanager@biz360.com Bot home page [new window]" target="_blank">Biz360 spider</a>', |
| 1330 | | | | 'blogbridge\sservice','<a href="http://www.blogbridge.com/" title="Bot home page [new window]" target="_blank">BlogBridge Service</a>', |
| 1331 | | | | 'bloglines','<a href="http://www.bloglines.com/" title="Bot home page [new window]" target="_blank">Bloglines</a>', |
| 1332 | | | | 'blogpulse','<a href="http://www.intelliseek.com/" title="Bot home page [new window]" target="_blank">BlogPulse ISSpider intelliseek.com</a>', |
| 1333 | | | | 'blogsearch','<a href="http://www.icerocket.com/" title="Bot home page [new window]" target="_blank">BlogSearch</a>', |
| 1334 | | | | 'blogshares','<a href="http://blogshares.com/help.php?node=7" title="Bot home page [new window]" target="_blank">Blogshares Spiders</a>', |
| 1335 | | | | 'blogslive','<a href="http://www.blogslive.com/" title="info@blogslive.com Bot home page [new window]" target="_blank">Blogslive</a>', |
| 1336 | | | | 'blogssay','<a href="http://www.blogssay.com/" title="Bot home page [new window]" target="_blank">BlogsSay :: RSS Search Crawler</a>', |
| 1337 | | | | 'bncf\.firenze\.sbn\.it\/raccolta\.txt','<a href="http://www.bncf.firenze.sbn.it/raccolta.txt" title="Bot home page [new window]" target="_blank">Biblioteca Nazionale Centrale di Firenze</a>', |
| 1338 | | | | 'bobby', 'Bobby', |
| 1339 | | | | 'boitho\.com\-dc','<a href="http://www.boitho.com/dcbot.html" title="Bot home page [new window]" target="_blank">boitho.com-dc</a>', |
| 1340 | | | | 'bookmark\-manager','<a href="http://bkm.sourceforge.net/" title="Bookmark-Manager home page [new window]" target="_blank">Bookmark-Manager</a>', |
| 1341 | | | | 'boris', 'Boris', |
| 1342 | | | | 'bumblebee', 'Bumblebee (relevare.com)', |
| 1343 | | | | 'candlelight\_favorites\_inspector','<a href="http://www.candlelight.com/home.html" title="Candlelight_Favorites_Inspector home page [new window]" target="_blank">Candlelight_Favorites_Inspector</a>', |
| 1344 | | | | 'cbn00glebot','cbn00glebot', |
| 1345 | | | | 'cerberian\sdrtrs','<a href="http://www.pgts.com.au/cgi-bin/psql?robot_info=25240" title="Bot home page [new window]" target="_blank">Cerberian Drtrs</a>', |
| 1346 | | | | 'cfnetwork','<a href="http://www.cocoadev.com/index.pl?CFNetwork" title="CFNetwork home page [new window]" target="_blank">CFNetwork</a>', |
| 1347 | | | | 'cipinetbot','<a href="http://www.cipinet.com/bot.html" title="CipinetBot home page [new window]" target="_blank">CipinetBot</a>', |
| 1348 | | | | 'checkweb\slink\svalidator','<a href="http://p.duby.free.fr/chkweb.htm" title="CheckWeb link validator home page [new window]" target="_blank">CheckWeb link validator</a>', |
| 1349 | | | | 'commons\-httpclient','<a href="http://jakarta.apache.org/commons/httpclient/" title="Bot home page [new window]" target="_blank">Jakarta commons-httpclient</a>', |
| 1350 | | | | 'computer\sand\sautomation\sresearch\sinstitute\scrawler','<a href="http://www.ilab.sztaki.hu/~stamas/publications/p184-benczur.html" title="Computer and Automation Research Institute Crawler home page [new window]" target="_blank">Computer and Automation Research Institute Crawler</a>', |
| 1351 | | | | 'converamultimediacrawler','<a href="http://www.authoritativeweb.com/crawl/" title="ConveraMultiMediaCrawler home page [new window]" target="_blank">ConveraMultiMediaCrawler</a>', |
| 1352 | | | | 'converacrawler','<a href="http://www.authoritativeweb.com/crawl/" title="ConveraCrawler home page [new window]" target="_blank">ConveraCrawler</a>', |
| 1353 | | | | 'cscrawler','CsCrawler', |
| 1354 | | | | 'cse\shtml\svalidator\slite\sonline','<a href="http://online.htmlvalidator.com/php/onlinevallite.php" title="CSE HTML Validator Lite Online home page [new window]" target="_blank">CSE HTML Validator Lite Online</a>','cuasarbot','<a href="http://www.cuasar.com/" title="Cuasarbot home page [new window]" target="_blank">Cuasarbot</a>', |
| 1355 | | | | 'cursor','<a href="http://adcenter.hu/docs/en/bot.html " title="Cursor home page [new window]" target="_blank">Cursor</a>', |
| 1356 | | | | 'custo','<a href="http://www.netwu.com/custo/" title="Custo home page [new window]" target="_blank">Custo</a>', |
| 1357 | | | | 'datafountains/dmoz\sdownloader','<a href="http://infomine.ucr.edu/ " title="DataFountains/DMOZ Downloader home page [new window]" target="_blank">DataFountains/DMOZ Downloader</a>', |
| 1358 | | | | 'daviesbot', 'DaviesBot', |
| 1359 | | | | 'daypopbot', 'DayPop', |
| 1360 | | | | 'deepindex','<a href="http://www.deepindex.net/faq.php" title="Deepindex home page [new window]" target="_blank">Deepindex</a>', |
| 1361 | | | | 'dipsie\.bot','<a href="http://www.dipsie.com/bot/" title="Bot home page [new window]" target="_blank">Dipsie</a>', |
| 1362 | | | | 'dnsgroup','<a href="http://www.dnsgroup.com/" title="DNSGroup home page [new window]" target="_blank">DNSGroup</a>', |
| 1363 | | | | 'docomo','<a href="http://www.nttdocomo.co.jp/" title="DoCoMo home page [new window]" target="_blank">DoCoMo</a>', |
| 1364 | | | | 'domainchecker','<a href="http://net-promoter.com/" title="DomainChecker home page (not confirmed) [new window]" target="_blank">DomainChecker</a>', |
| 1365 | | | | 'domainsdb\.net','<a href="http://domainsdb.net/" title="Bot home page [new window]" target="_blank">DomainsDB.net</a>', |
| 1366 | | | | 'dulance','<a href="http://www.dulance.com/bot.jsp" title="Bot home page [new window]" target="_blank">Dulance</a>', |
| 1367 | | | | 'dumbot','<a href="http://www.dumbfind.com/" title="Dumbot home page [new window]" target="_blank">Dumbot</a>', |
| 1368 | | | | 'dumm\.de\-bot','<a href="http://www.dumm.de/" title="dumm.de-Bot home page [new window]" target="_blank">dumm.de-Bot</a>', |
| 1369 | | | | 'earthcom\.info','<a href="http://www.earthcom.info/" title="Bot home page [new window]" target="_blank">EARTHCOM.info</a>', |
| 1370 | | | | 'easydl','<a href="http://keywen.com/Encyclopedia/Bot/" title="EasyDL home page [new window]" target="_blank">EasyDL</a>', |
| 1371 | | | | 'edgeio\-retriever','<a href="http://www.edgeio.com/" title="Bot home page [new window]" target="_blank">edgeio-retriever</a>', |
| 1372 | | | | 'ets\sv','<a href="http://www.freetranslation.com/help/" title="ETS home page [new window]" target="_blank">ETS</a> Enterprise Translation Server', |
| 1373 | | | | 'exactseek','ExactSeek Crawler', |
| 1374 | | | | 'extreme\_picture\_finder','<a href="http://www.exisoftware.com/" title="Extreme_Picture_Finder home page [new window]" target="_blank">Extreme_Picture_Finder</a>', |
| 1375 | | | | 'eventax','<a href="http://www.eventax.de/" title="eventax home page [new window]" target="_blank">eventax</a>', |
| 1376 | | | | 'everbeecrawler','EverbeeCrawler', |
| 1377 | | | | 'everest\-vulcan','<a href="http://everest.vulcan.com/crawlerhelp" title="Bot home page [new window]" target="_blank">Everest-Vulcan</a>', |
| 1378 | | | | 'ezresult', 'Ezresult', |
| 1379 | | | | 'enteprise','<a href="http://www.fastsearch.com/" title="Bot home page [new window]" target="_blank">Fast Enteprise Crawler</a>', |
| 1380 | | | | 'fast\-search\-engine','<a href="http://www.fast-search-engine.com/" title="Bot home page [new window]" target="_blank">Fast-Search-Engine</a> (not fastsearch.com)', |
| 1381 | | | | 'fast\senterprise\scrawler','<a href="http://www.fast.no/" title="FAST Enterprise Crawler home page [new window]" target="_blank">FAST Enterprise Crawler</a>', |
| 1382 | | | | 'fast\senterprise\scrawler.*scrawleradmin\.t\-info@telekom\.de','<a href="http://www.telekom.de/" title="FAST Enterprise Crawler * crawleradmin.t-info@telekom.de home page [new window]" target="_blank">FAST Enterprise Crawler * crawleradmin.t-info@telekom.de</a>', |
| 1383 | | | | 'matrix\ss\.p\.a\.\s\-\sfast\senterprise\scrawler','<a href="http://tin.virgilio.it/" title="Matrix S.p.A. - FAST Enterprise Crawler home page [new window]" target="_blank">Matrix S.p.A. - FAST Enterprise Crawler</a>', |
| 1384 | | | | 'fast\senterprise\scrawler.*t\-info_bi_cluster\scrawleradmin\.t\-info@telekom\.de','<a href="http://www.telekom.de/" title="FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de home page [new window]" target="_blank">FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de</a>', |
| 1385 | | | | 'favorg','<a href="http://www.pcmag.com/article2/0,4149,108438,00.asp" title="FavOrg home page [new window]" target="_blank">FavOrg</a>', |
| 1386 | | | | 'favorites\ssweeper','<a href="http://www.manitools.com/favsweep/" title="Favorites_Sweeper home page [new window]" target="_blank">Favorites Sweeper</a>', |
| 1387 | | | | 'feedburner', 'Feedburner', |
| 1388 | | | | 'feedfetcher\-google','<a href="http://www.google.com/feedfetcher.html" title="Bot home page [new window]" target="_blank">Feedfetcher-Google</a>', |
| 1389 | | | | 'feedflow','<a href="http://feedflow.com/about" title="Bot home page [new window]" target="_blank">FeedFlow</a>', |
| 1390 | | | | 'feedster','<a href="http://www.feedster.com/" title="Bot home page [new window]" target="_blank">Feedster</a>', |
| 1391 | | | | 'feedvalidator','<a href="http://feedvalidator.org/" title="FeedValidator home page [new window]" target="_blank">FeedValidator</a>', |
| 1392 | | | | 'filmkamerabot','<a href="http://www.filmkamera.at/bot.html" title="FilmkameraBot home page [new window]" target="_blank">FilmkameraBot</a>', |
| 1393 | | | | 'findexa\scrawler','<a href="http://www.findexa.no/gulesider/article26548.ece " title="Findexa Crawler home page [new window]" target="_blank">Findexa Crawler</a>', |
| 1394 | | | | 'geniebot','<a href="http://www.genieknows.com/" title="Bot home page [new window]" target="_blank">Geniebot</a>', |
| 1395 | | | | 'findlinks','<a href="http://wortschatz.uni-leipzig.de/findlinks/" title="Bot home page [new window]" target="_blank">Findlinks</a>', |
| 1396 | | | | 'fooky\.com\/ScorpionBot','<a href="http://www.fooky.com/scorpionbots" title="Fooky.com/ScorpionBot/ScoutOut home page [new window]" target="_blank">Fooky.com/ScorpionBot/ScoutOut</a>', |
| 1397 | | | | 'g2crawler','<a href="http://crawler.instantnetworks.net/" title="Bot home page (nobody@airmail.net) [new window]" target="_blank">G2Crawler</a>', |
| 1398 | | | | 'gaisbot','<a href="http://gais.cs.ccu.edu.tw/robot.php" title="Bot home page [new window]" target="_blank">Gaisbot</a>', |
| 1399 | | | | 'gigabot','<a href="http://www.gigablast.com/spider.html" title="Bot home page [new window]" target="_blank">GigaBot</a>', |
| 1400 | | | | 'girafabot','<a href="http://www.girafa.com/" title="Bot home page [new window]" target="_blank">Girafabot</a>', |
| 1401 | | | | 'global\sfetch','<a href="http://www.wesonet.com/" title="Global Fetch home page [new window]" target="_blank">Global Fetch</a>', |
| 1402 | | | | 'gnodspider','GNOD Spider', |
| 1403 | | | | 'goforit\.com','<a href="http://www.goforit.com/about/" title="GoForIt.com home page [new window]" target="_blank">GoForIt.com</a>', |
| 1404 | | | | 'goforitbot','<a href="http://www.goforit.com/about/" title="GOFORITBOT home page [new window]" target="_blank">GOFORITBOT</a>', |
| 1405 | | | | 'gpu\sp2p\scrawler','<a href="http://gpu.sourceforge.net/search_engine.php" title="Bot home page [new window]" target="_blank">GPU p2p crawler</a>', |
| 1406 | | | | 'grub','Grub.org', |
| 1407 | | | | 'henrythemiragorobot', '<a href="http://www.miragorobot.com/scripts/mrinfo.asp" title="Bot home page [new window]" target="_blank">Mirago</a>', |
| 1408 | | | | 'heritrix','<a href="http://crawler.archive.org/" title="(used by a few different companies) Bot home page [new window]" target="_blank">Heritrix</a>', |
| 1409 | | | | 'holmes', 'Holmes', |
| 1410 | | | | 'hoowwwer','<a href="http://cosco.hiit.fi/search/hoowwwer/" title="HooWWWer home page [new window]" target="_blank">HooWWWer</a>', |
| 1411 | | | | 'hpprint','HPPrint', |
| 1412 | | | | 'htmlparser','<a href="http://htmlparser.sourceforge.net/" title="HTMLParser home page [new window]" target="_blank">HTMLParser</a>', |
| 1413 | | | | 'html\_link\_validator','<a href="http://www.lithopssoft.com/ " title="Html_Link_Validator home page [new window]" target="_blank">Html_Link_Validator</a>', |
| 1414 | | | | 'httrack','<a href="http://www.httrack.com/" title="Bot home page [new window]" target="_blank">HTTrack off-line browser</a>', |
| 1415 | | | | 'hundesuche\.com\-bot','<a href="http://www.hundesuche.com/" title="Hundesuche.com-Bot home page [new window]" target="_blank">Hundesuche.com-Bot</a>', |
| 1416 | | | | 'ichiro','<a href="http://help.goo.ne.jp/door/crawlerE.html" title="Bot home page [new window]" target="_blank">ichiro</a>', |
| 1417 | | | | 'iltrovatore\-setaccio','<a href="http://www.iltrovatore.it/aiuto/motore_di_ricerca.html" title="bot@iltrovatore.it IlTrovatore-Setaccio home page [new window]" target="_blank">IlTrovatore-Setaccio</a>', |
| 1418 | | | | 'infobot','<a href="http://www.infobot.org/" title="InfoBot home page [new window]" target="_blank">InfoBot</a>', |
| 1419 | | | | 'infociousbot','<a href="http://corp.infocious.com/tech_crawler.php" title="InfociousBot home page [new window]" target="_blank">InfociousBot</a>', |
| 1420 | | | | 'infomine','<a href="http://infomine.ucr.edu/useragents" title="Bot home page [new window]" target="_blank">INFOMINE VLCrawler</a>', |
| 1421 | | | | 'insurancobot','<a href="http://www.fastspywareremoval.com/" title="InsurancoBot home page [new window]" target="_blank">InsurancoBot</a>', |
| 1422 | | | | 'internet\_ninja','<a href="http://www.dti.ne.jp/ " title="Internet_Ninja home page [new window]" target="_blank">Internet_Ninja </a>', |
| 1423 | | | | 'internetarchive','<a href="http://lucene.apache.org/nutch/bot.html " title="InternetArchive home page [new window]" target="_blank">InternetArchive</a>', |
| 1424 | | | | 'internetseer', 'InternetSeer', |
| 1425 | | | | 'internetsupervision','<a href="http://internetsupervision.com/" title="InternetSupervision home page [new window]" target="_blank">InternetSupervision</a>', |
| 1426 | | | | 'irlbot','<a href="http://irl.cs.tamu.edu/crawler" title="Bot home page [new window]" target="_blank">IRLbot</a>', |
| 1427 | | | | 'isearch2006','<a href="http://www.yahoo.com.cn/" title="isearch2006 home page [new window]" target="_blank">isearch2006</a>', |
| 1428 | | | | 'iupui_research_bot','<a href="http://spamhuntress.com/2005/04/25/a-mail-harvester-visits/" title="IUPUI_Research_Bot home page [new window]" target="_blank">IUPUI_Research_Bot</a>', |
| 1429 | | | | 'jrtwine\_software\_check\_favorites\_utility','<a href="http://www.jrtwine.com/Products/CheckFavs/" title="JRTwine_Software_Check_Favorites_Utility home page [new window]" target="_blank">JRTwine_Software_Check_Favorites_Utility</a>', |
| 1430 | | | | 'justview', 'JustView', |
| 1431 | | | | 'kalambot','<a href="http://64.124.122.251/feedback.html" title="KalamBot home page [new window]" target="_blank">KalamBot</a>', |
| 1432 | | | | 'kamano\.de\snewsfeedverzeichnis','<a href="http://www.kamano.de/" title="kamano.de NewsFeedVerzeichnis home page [new window]" target="_blank">kamano.de NewsFeedVerzeichnis</a>', |
| 1433 | | | | 'kazoombot','<a href="http://www.kazoom.ca/bot.html" title="kazoombot@kazoom.ca KazoomBot home page [new window]" target="_blank">KazoomBot</a>', |
| 1434 | | | | 'kevin','<a href="http://dznet.com/kevin/" title="Kevin home page [new window]" target="_blank">Kevin</a>', |
| 1435 | | | | 'keyoshid','<a href="http://www.yahoo.co.jp/" title="Bot home page [new window]" target="_blank">Yahoo! Japan keyoshid robot study</a>', |
| 1436 | | | | 'kinjabot', 'Kinjabot', |
| 1437 | | | | 'kinja\-imagebot', 'Kinja Imagebot', |
| 1438 | | | | 'knowitall','<a href="http://www.cs.washington.edu/research/knowitall/" title="KnowItAll home page [new window]" target="_blank">KnowItAll</a>', |
| 1439 | | | | 'knowledge\.com','<a href="http://www.knowledge.com/" title="Knowledge.com home page [new window]" target="_blank">Knowledge.com</a>', |
| 1440 | | | | 'kouaa\skrawler','<a href="http://www.kouaa.com/" title="Kouaa Krawler home page [new window]" target="_blank">Kouaa Krawler</a>', |
| 1441 | | | | 'krugle','<a href="http://www.krugle.com/crawler/info.html" title="Bot home page [new window]" target="_blank">Krugle</a>', |
| 1442 | | | | 'ksibot','<a href="http://ego.ms.mff.cuni.cz/" title="Bot home page [new window]" target="_blank">ksibot</a>', |
| 1443 | | | | 'kurzor','<a href="http://www.easymail.hu/" title="cursor@easymail.hu Kurzor home page [new window]" target="_blank">Kurzor</a>', |
| 1444 | | | | 'lanshanbot','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_g_l_140406_1%5Cb" title="Bot Information [new window]" target="_blank">lanshanbot</a>', |
| 1445 | | | | 'letscrawl\.com','<a href="http://letscrawl.com/" title="Bot home page [new window]" target="_blank">LetsCrawl.com</a>', |
| 1446 | | | | 'libcrawl','Crawl libcrawl', |
| 1447 | | | | 'link\svalet\sonline','<a href="http://www.htmlhelp.com/tools/valet/" title="Link Valet Online home page [new window]" target="_blank">Link Valet Online</a>', |
| 1448 | | | | 'linkbot','LinkBot', |
| 1449 | | | | 'linkchecker','<a href="http://linkchecker.sourceforge.net" title="Bot home page [new window]" target="_blank">LinkChecker</a>', |
| 1450 | | | | 'livejournal\.com', 'LiveJournal.com', |
| 1451 | | | | 'magpierss', 'MagpieRSS', |
| 1452 | | | | 'mapoftheinternet\.com','<a href="http://MapoftheInternet.com/" title="MapoftheInternet.com home page [new window]" target="_blank">MapoftheInternet.com</a>', |
| 1453 | | | | 'mediapartners\-google','<a href="https://adwords.google.com/" title="Bot home page [new window]" target="_blank">Google AdSense</a>', |
| 1454 | | | | 'megite','<a href="http://www.megite.com/" title="Megite home page [new window]" target="_blank">Megite</a>', |
| 1455 | | | | 'metager\-linkchecker','MetaGer LinkChecker', |
| 1456 | | | | 'metaspinner','<a href="http://index.meta-spinner.de/" title="Metaspinner home page [new window]" target="_blank">Metaspinner</a>', |
| 1457 | | | | 'microsoft\surl\scontrol','<a href="http://www.webmasterworld.com/forum11/1005.htm" title="Microsoft URL Control home page [new window]" target="_blank">Microsoft URL Control</a>', |
| 1458 | | | | 'minirank','<a href="http://minirank.com/" title="miniRank home page [new window]" target="_blank">miniRank</a>', |
| 1459 | | | | 'mini\-reptile','Mini-reptile', |
| 1460 | | | | 'missigua\slocator','<a href="http://www.webmasterworld.com/forum11/2690.htm" title="Missigua_Locator home page [new window]" target="_blank">Missigua_Locator</a>', |
| 1461 | | | | 'misterbot','<a href="http://www.misterbot.fr/" title="Misterbot home page [new window]" target="_blank">Misterbot</a>', |
| 1462 | | | | 'miva','<a href="http://www.miva.com/" title="Miva home page [new window]" target="_blank">Miva</a>', |
| 1463 | | | | 'mizzu\slabs','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b " title="Mizzu Labs home page [new window]" target="_blank">Mizzu Labs</a>', |
| 1464 | | | | 'mj12bot','<a href="http://majestic12.co.uk/bot.php" title="Bot home page. [new window]" target="_blank">MJ12bot</a>', |
| 1465 | | | | 'mojeekbot','<a href="http://www.mojeek.com/bot.html" title="Bot home page. [new window]" target="_blank">MojeekBot</a>', |
| 1466 | | | | 'tencenttraveler','TencentTraveler', # Must be before msiecrawler. |
| 1467 | | | | 'msiecrawler','<a href="http://msdn.microsoft.com/workshop/delivery/offline/linkrel.asp" title="Bot home page. [new window]" target="_blank">MSIECrawler</a>', |
| 1468 | | | | 'ms\ssearch\s4\.0\srobot','<a href="http://support.microsoft.com/default.aspx?scid=kb;en-us;284022" title="Bot home page. [new window]" target="_blank">MS SharePoint Portal Server - MS Search 4.0 Robot</a>', |
| 1469 | | | | 'msrabot','msrabot', |
| 1470 | | | | 'msrbot','<a href="http://research.microsoft.com/research/sv/msrbot/" title="MSRBOT home page [new window]" target="_blank">MSRBOT</a>', |
| 1471 | | | | 'mt::telegraph::agent','MT::Telegraph::Agent', |
| 1472 | | | | 'mydoyouhike','<a href="http://www.doyouhike.net/my" title="Mydoyouhike home page [new window]" target="_blank">Mydoyouhike</a>', |
| 1473 | | | | 'nagios','Nagios', |
| 1474 | | | | 'nasa\ssearch','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_140506_2\b" title="NASA Search home page [new window]" target="_blank">NASA Search</a>', |
| 1475 | | | | 'netluchs','<a href="http://www.netluchs.de/" title="Bot home page. [new window]" target="_blank">Netluchs</a>', |
| 1476 | | | | 'netsprint','<a href="http://www.netsprint.pl/serwis/" title="NetSprint home page [new window]" target="_blank">NetSprint</a>', |
| 1477 | | | | 'newsgatoronline', 'NewsGator Online', |
| 1478 | | | | 'nicebot','<a href="http://www.egghelp.org/setup.htm" title="Bot home page (there may be others) [new window]" target="_blank">nicebot</a>', |
| 1479 | | | | 'nimblecrawler','<a href="http://www.healthline.com/" title="NimbleCrawler home page [new window]" target="_blank">NimbleCrawler</a>', |
| 1480 | | | | 'noxtrumbot','<a href="http://www.noxtrum.com/" title="Bot home page [new window]" target="_blank">noxtrumbot</a>', |
| 1481 | | | | 'npbot','<a href="http://www.nameprotect.com/botinfo.html" title="NPBot home page [new window]" target="_blank">NPBot</a>', |
| 1482 | | | | 'nutchcvs','<a href="http://lucene.apache.org/nutch/bot.html" title="NutchCVS home page [new window]" target="_blank">NutchCVS</a>', |
| 1483 | | | | 'nutchosu\-vlib','<a href="http://lucene.apache.org/nutch/bot.html" title="NutchOSU-VLIB home page [new window]" target="_blank">NutchOSU-VLIB</a>', |
| 1484 | | | | 'nutch','<a href="http://lucene.apache.org/nutch/" title="Bot home page. Used by many, including Looksmart. [new window]" target="_blank">Nutch</a>', |
| 1485 | | | | 'ocelli','<a href="http://www.globalspec.com/Ocelli/" title="Ocelli home page [new window]" target="_blank">Ocelli</a>', |
| 1486 | | | | 'octora\sbeta\sbot','<a href="http://www.octora.com/" title="Bot home page [new window]" target="_blank">Octora Beta Bot</a>', |
| 1487 | | | | 'omniexplorer\_bot','<a href="http://www.omni-explorer.com/" title="Bot home page. [new window]" target="_blank">OmniExplorer Bot</a>', |
| 1488 | | | | 'onet\.pl\_sa','<a href="http://szukaj.onet.pl/" title="Onet.pl_SA home page [new window]" target="_blank">Onet.pl_SA</a>', |
| 1489 | | | | 'onfolio','<a href="http://www.onfolio.com/" title="Bot home page [new window]">Onfolio</a>', |
| 1490 | | | | 'opentaggerbot','<a href="http://www.opentagger.com/opentaggerbot.htm" title="Bot home page [new window]">OpenTaggerBot</a>', |
| 1491 | | | | 'openwebspider','<a href="http://www.openwebspider.org/" title="OpenWebSpider home page [new window]" target="_blank">OpenWebSpider</a>', |
| 1492 | | | | 'oracle\sultra\ssearch','<a href="http://www.oracle.com/technology/products/ultrasearch/index.html" title="Oracle Ultra Search home page [new window]" target="_blank">Oracle Ultra Search</a>', |
| 1493 | | | | 'orbiter','<a href="http://www.dailyorbit.com/bot.htm" title="Orbiter home page [new window]" target="_blank">Orbiter</a>', |
| 1494 | | | | 'outfoxbot','<a href="mailto:outfox.agent@gmail.com?subject=Outfox Bot Information" title="Bot e-mail.">OutfoxBot</a>', |
| 1495 | | | | 'passwordmaker\.org','<a href="http://passwordmaker.org/" title="passwordmaker.org home page [new window]" target="_blank">passwordmaker.org</a>', |
| 1496 | | | | 'pear\shttp\srequest\sclass','<a href="http://pear.php.net/" title="PEAR HTTP Request class home page [new window]" target="_blank">PEAR HTTP Request class</a>', |
| 1497 | | | | 'peerbot','<a href="http://www.peerbot.com/" title="PEERbot home page [new window]" target="_blank">PEERbot</a>', |
| 1498 | | | | 'perman', 'Perman surfer', |
| 1499 | | | | 'php\_version\_tracker','<a href="http://www.nexen.net/phpversion/bot.php" title="PHP_version_tracker home page [new window]" target="_blank">PHP_version_tracker</a>', |
| 1500 | | | | 'php\sversion\stracker','<a href="http://www.nexen.net/phpversion/bot.php" title="PHP version tracker home page [new window]" target="_blank">PHP version tracker</a>', |
| 1501 | | | | 'pictureofinternet','<a href="http://malfunction.org/poi/" title="PictureOfInternet home page [new window]" target="_blank">PictureOfInternet</a>', |
| 1502 | | | | 'ping\.blo\.gs','<a href="http://blo.gs/ping.php" title="Bot home page. [new window]" target="_blank">ping.blo.gs</a>', |
| 1503 | | | | 'plinki','<a href="http://www.plinki.com/" title="plinki home page [new window]" target="_blank">plinki</a>', |
| 1504 | | | | 'pluckfeedcrawler','<a href="http://www.pluck.com/" title="Bot home page. [new window]" target="_blank">PluckFeedCrawler</a>', |
| 1505 | | | | 'pompos','<a href="http://dir.com/pompos.html" title="Bot home page. [new window]" target="_blank">Pompos</a>', |
| 1506 | | | | 'popdexter','Popdexter', |
| 1507 | | | | 'port\shuron\slabs','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1133\b" title="Port Huron Labs home page [new window]" target="_blank">Port Huron Labs</a>', |
| 1508 | | | | 'postfavorites','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b " title="PostFavorites home page [new window]" target="_blank">PostFavorites</a>', |
| 1509 | | | | 'projectwf\-java\-test\-crawler','ProjectWF-java-test-crawler', |
| 1510 | | | | 'proodlebot','<a href="http://www.proodle.com/" title="proodleBot home page [new window]" target="_blank">proodleBot</a>', |
| 1511 | | | | 'pyquery','<a href="http://sourceforge.net/projects/pyquery/" title="PyQuery home page [new window]" target="_blank">PyQuery</a>', |
| 1512 | | | | 'rambler','<a href="http://www.rambler.ru/doc/faq.shtml" title="Bot home page [new window]">StackRambler</a>', |
| 1513 | | | | 'redalert','Red Alert', |
| 1514 | | | | 'relevantnoise\.com', '<a href="http://www.relevantnoise.com/" title="Relevant Noise [new window]" target="_blank">Relevant Noise</a>', |
| 1515 | | | | 'rojo','<a href="http://rojo.com/" title="Bot home page [new window]" target="_blank">RoJo</a> aggregator', |
| 1516 | | | | 'rssimagesbot','<a href="http://herbert.groot.jebbink.nl/?app=rssImages" title="Bot home page [new window]" target="_blank">rssImagesBot</a>', |
| 1517 | | | | 'ruffle','<a href="http://www.unreach.net/" title="Bot home page [new window]" target="_blank">ruffle SemanticWeb crawler</a>', |
| 1518 | | | | 'rufusbot','<a href="http://64.124.122.252.webaroo.com/feedback.html" title="Bot home page [new window]" target="_blank">RufusBot Rufus Web Miner</a>', |
| 1519 | | | | 'sandcrawler','<a href="http://www.microsoft.com/" title="Bot home page [new window]" target="_blank">SandCrawler (Microsoft)</a>', |
| 1520 | | | | 'sbider','<a href="http://www.sitesell.com/sbider.html" title="Bot home page [new window]" target="_blank">SBIder</a>', |
| 1521 | | | | 'schizozilla','<a href="http://spamhuntress.com/2005/03/18/gizmo/ " title="Schizozilla home page [new window]" target="_blank">Schizozilla</a>', |
| 1522 | | | | 'scumbot','Scumbot', |
| 1523 | | | | 'searchguild\_dmoz\_experiment','<a href="http://www.searchguild.com/" title="SearchGuild_DMOZ_Experiment home page [new window]" target="_blank">SearchGuild_DMOZ_Experiment</a>', |
| 1524 | | | | 'seekbot','<a href="http://www.seekbot.net/bot.html" title="Bot home page [new window]">Seekbot</a>', |
| 1525 | | | | 'sensis\sweb\scrawler','<a href="http://www.sensis.com.au/" title="Sensis Web Crawler home page [new window]" target="_blank">Sensis Web Crawler</a>', |
| 1526 | | | | 'seznambot','<a href="http://fulltext.seznam.cz/" title="Bot home page [new window]" target="_blank">SeznamBot</a>', |
| 1527 | | | | 'shim\-crawler','<a href="http://www.logos.ic.i.u-tokyo.ac.jp/crawler/" title="crawl@logos.ic.i.u-tokyo.ac.jp Bot home page [new window]" target="_blank">Shim-Crawler</a>', |
| 1528 | | | | 'shoutcast','Shoutcast Directory Service', |
| 1529 | | | | 'slysearch','SlySearch', |
| 1530 | | | | 'snap\.com\sbeta\scrawler','<a href="http://www.snap.com/" title="snap.com beta crawler home page [new window]" target="_blank">snap.com beta crawler</a>', |
| 1531 | | | | 'sogou\sspider','<a href="http://corp.sohu.com/20051130/n240842344.shtml" title="Bot home page [new window]" target="_blank">sogou spider</a>', |
| 1532 | | | | 'sogou\stest','<a href="http://corp.sohu.com/20051130/n240842344.shtml" title="Bot home page [new window]" target="_blank">sogou test</a>', |
| 1533 | | | | 'sohu\-search','<a href="http://corp.sohu.com/" title="Bot home page [new window]" target="_blank">sohu-search</a>', |
| 1534 | | | | 'sohu','<a href="http://corp.sohu.com/" title="Bot home page [new window]" target="_blank">sohu agent</a>', |
| 1535 | | | | 'snappy','<a href="http://www.urltrends.com/faq.php" title="Bot home page [new window]" target="_blank">Snappy</a>', |
| 1536 | | | | 'sphere\sscout','<a href="http://www.sphere.com/" title="Bot home page [new window]" target="_blank">Sphere Scout</a>', |
| 1537 | | | | 'sproose\scrawler','<a href="http://www.sproose.com/bot.html" title="Bot home page [new window]" target="_blank">sproose crawler</a>', |
| 1538 | | | | 'steroid\s\sdownload','<a href="http://faqs.org.ru/progr/pascal/delphi_internet2.htm" title="STEROID Download home page [new window]" target="_blank">STEROID Download</a>', |
| 1539 | | | | 'steeler','<a href="http://www.tkl.iis.u-tokyo.ac.jp/~crawler/ " title="Steeler home page [new window]" target="_blank">Steeler</a>', |
| 1540 | | | | 'suchfin\-bot','<a href="http://www.suchfin.de/" title="Suchfin-Bot home page [new window]" target="_blank">Suchfin-Bot</a>', |
| 1541 | | | | 'superbot','<a href="http://www.sparkleware.com/superbot/" title="SuperBot home page [new window]" target="_blank">SuperBot</a>', |
| 1542 | | | | 'surveybot','SurveyBot', |
| 1543 | | | | 'susie','<a href="http://www.sync2it.com/bms/susie.php" title="Susie home page [new window]" target="_blank">Susie</a>', |
| 1544 | | | | 'syndic8','Syndic8', |
| 1545 | | | | 'syndicapi','<a href="http://syndicapi.com/bot.html" title="Bot home page [new window]" target="_blank">SyndicAPI</a>', |
| 1546 | | | | 'synoobot','<a href="http://www.synoo.de/bot.html" title="webmaster@synoo.com SynooBot home page [new window]" target="_blank">SynooBot</a>', |
| 1547 | | | | 'tcl\shttp\sclient\spackage','<a href="http://www.tcl.tk/man/tcl8.4/TclCmd/http.htm" title="Tcl http client package home page [new window]" target="_blank">Tcl http client package</a>', |
| 1548 | | | | 'technoratibot', 'Technoratibot', |
| 1549 | | | | 'teragramcrawlersurf','<a href="http://www.teragram.com/" title="TeragramCrawlerSURF home page [new window]" target="_blank">TeragramCrawlerSURF</a>', |
| 1550 | | | | 'test\scrawler','<a href="http://netp.ath.cx/" title="Test Crawler home page [new window]" target="_blank">Test Crawler</a>', |
| 1551 | | | | 'testbot','<a href="http://www.agbrain.com/" title="TestBot home page [new window]" target="_blank">TestBot</a>', |
| 1552 | | | | 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e','<a href="http://www.thunderstone.com/" title="Bot home page. Used by many. [new window]" target="_blank">T-H-U-N-D-E-R-S-T-O-N-E</a>', |
| 1553 | | | | 'topicblogs', '<a href="http://www.topicblogs.com/" title="Bot home page [new window]" target="_blank">topicblogs</a>', |
| 1554 | | | | 'turnitinbot','Turn It In', |
| 1555 | | | | 'turtle', 'Turtle', |
| 1556 | | | | 'turtlescanner', 'Turtle', |
| 1557 | | | | 'tutorgigbot','<a href="http://www.tutorgig.info/" title="TutorGigBot home page [new window]" target="_blank">TutorGigBot</a>', |
| 1558 | | | | 'ubicrawler','<a href="http://law.dsi.unimi.it/ubicrawler/" title="Bot home page [new window]" target="_blank">UbiCrawler</a>', |
| 1559 | | | | 'ultraseek', 'Ultraseek', |
| 1560 | | | | 'unchaos\sbot\shybrid\sweb\ssearch\sengine','<a href="http://www.unchaos.com/" title="UnChaos Bot Hybrid Web Search Engine home page [new window]" target="_blank">UnChaos Bot Hybrid Web Search Engine</a>', |
| 1561 | | | | 'unido\-bot','<a href="http://www.unchina.org/unido/unido/our_projects/3_3.html" title="unido-bot home page [new window]" target="_blank">unido-bot</a>', |
| 1562 | | | | 'up\.browser','<a href="http://developer.openwave.com/dvl/support/faqs/faq_mag_browser.htm" title="UP.Browser home page [new window]" target="_blank">UP.Browser</a>', |
| 1563 | | | | 'updated','<a href="http://www.updated.com/" title="updated home page [new window]" target="_blank">updated</a>', |
| 1564 | | | | 'ustc\-semantic\-group','<a href="http://ai.ustc.edu.cn/mas/en/research/index.php" title="Bot home page [new window]" target="_blank">USTC-Semantic-Group</a>', |
| 1565 | | | | 'vagabondo\-wap','<a href="http://www.wise-guys.nl/Contact/index.php?botselected=webagents&lang=uk" title="Bot home page [new window]" target="_blank">Vagabondo-WAP</a>', |
| 1566 | | | | 'vagabondo','<a href="http://www.wise-guys.nl/Contact/index.php?botselected=webagents&lang=uk" title="Bot home page [new window]" target="_blank">Vagabondo</a>', |
| 1567 | | | | 'vermut','<a href="http://vermut.aol.com/" title="Bot home page [new window]" target="_blank">Vermut</a>', |
| 1568 | | | | 'versus\scrawler\sfrom\seda\.baykan@epfl\.ch','<a href="http://www.epfl.ch/Eindex.html " title="versus crawler from eda.baykan@epfl.ch home page [new window]" target="_blank">versus crawler from eda.baykan@epfl.ch</a>', |
| 1569 | | | | 'vespa\scrawler','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_t_z_030406_1%5Cb" title="Bot home page [new window]" target="_blank">Vespa Crawler</a>', |
| 1570 | | | | 'vortex','<a href="http://marty.anstey.ca/projects/robots/vortex/" title="Bot home page [new window]" target="_blank">VORTEX</a>', |
| 1571 | | | | 'vse','<a href="http://www.vivisimo.com/" title="VSE home page [new window]" target="_blank">VSE</a>', |
| 1572 | | | | 'w3c\-checklink','<a href="http://validator.w3.org/checklink/" title="Bot home page [new window]" target="_blank">W3C Link Checker</a>', |
| 1573 | | | | 'w3c\_css\_validator\_jfouffa', '<a href="http://jigsaw.w3.org/css-validator/" title="Bot home page [new window]" target="_blank">W3C jigsaw CSS Validator</a>', |
| 1574 | | | | 'w3c_validator','<a href="http://validator.w3.org/" title="Bot home page [new window]" target="_blank">W3C Validator</a>', |
| 1575 | | | | 'wavefire','<a href="http://www.wavefire.com" title="info@wavefire.com; Bot home page [new window]" target="_blank">Wavefire</a>', |
| 1576 | | | | 'webclipping\.com', 'WebClipping.com', |
| 1577 | | | | 'webcompass', 'webcompass', |
| 1578 | | | | 'webcrawl\.net','<a href="http://www.webcrawl.net/" title="webcrawl.net home page [new window]" target="_blank">webcrawl.net</a>', |
| 1579 | | | | 'web\sdownloader','<a href="http://www.krasu.ru/soft/chuchelo/" title="Web Downloader home page [new window]" target="_blank">Web Downloader</a>', |
| 1580 | | | | 'webdup','<a href="http://www.webdup.com/en/index.html" title="Webdup home page [new window]" target="_blank">Webdup</a>', |
| 1581 | | | | 'webfilter','<a href="http://www.verso.com/enterprise/netspective/webfilter.asp" title="Bot home page [new window]" target="_blank">WebFilter</a>', |
| 1582 | | | | 'webindexer','<a href="mailto://webindexerv1@yahoo.com" title="WebIndexer home page [new window]" target="_blank">WebIndexer</a>', |
| 1583 | | | | 'webminer','<a href="http://64.124.122.252/feedback.html" title="WebMiner home page [new window]" target="_blank">WebMiner</a>', |
| 1584 | | | | 'website\_monitoring\_bot','<a href="http://InternetSupervision.com/UrlMonitor/3/" title="Website_Monitoring_Bot home page [new window]" target="_blank">Website_Monitoring_Bot</a>', |
| 1585 | | | | 'webvulncrawl', 'WebVulnCrawl', |
| 1586 | | | | 'wells\ssearch','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b " title="Wells Search home page [new window]" target="_blank">Wells Search</a>', |
| 1587 | | | | 'wonderer', 'Web Wombat Redback Spider', |
| 1588 | | | | 'wume\scrawler','<a href="http://wume.cse.lehigh.edu/~xiq204/crawler/ " title="wume crawler home page [new window]" target="_blank">wume crawler</a>', |
| 1589 | | | | 'wwweasel',,'<a href="http://wwweasel.de/" title="Website_Monitoring_Bot home page [new window]" target="_blank">WWWeasel</a>', |
| 1590 | | | | 'xenu\'s\slink\ssleuth','<a href="http://home.snafu.de/tilman/xenulink.html" title="Xenu Link Sleuth home page [new window]" target="_blank">Xenu Link Sleuth</a>', |
| 1591 | | | | 'xenu\slink\ssleuth','<a href="http://home.snafu.de/tilman/xenulink.html" title="Xenu Link Sleuth home page [new window]" target="_blank">Xenu Link Sleuth</a>', |
| 1592 | | | | 'xirq','<a href="http://www.xirq.com/" title="xirq home page [new window]" target="_blank">xirq</a>', |
| 1593 | | | | 'y!j', '<a href="http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html" title="Bot home page [new window]" target="_blank">Y!J Yahoo Japan</a>', |
| 1594 | | | | 'yacy','<a href="http://www.yacy.net/yacy" title="Bot home page [new window]" target="_blank">yacy</a>', |
| 1595 | | | | 'yahoo\-blogs','<a href="http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html" title="Bot home page [new window]" target="_blank">Yahoo-Blogs</a>', |
| 1596 | | | | 'yahoo\-verticalcrawler', 'Yahoo Vertical Crawler', |
| 1597 | | | | 'yahoofeedseeker', '<a href="http://publisher.yahoo.com/rssguide" title="Bot home page [new window]" target="_blank">Yahoo Feed Seeker</a>', |
| 1598 | | | | 'yahooseeker\-testing', '<a href="http://search.yahoo.com/" title="Bot home page [new window]" target="_blank">YahooSeeker-Testing</a>', |
| 1599 | | | | 'yahooseeker', '<a href="http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html" title="Bot home page [new window]" target="_blank">YahooSeeker Yahoo! Blog crawler</a>', |
| 1600 | | | | 'yahoo\-mmcrawler', '<a href="mailto:mms-mmcrawler-support@yahoo-inc.com?subject=Yahoo-MMCrawler Information" title="E-mail Bot">Yahoo-MMCrawler</a>', |
| 1601 | | | | 'yahoo!\smindset','<a href="http://mindset.research.yahoo.com/" title="Bot home page [new window]">Yahoo! Mindset</a>', |
| 1602 | | | | 'yandex', 'Yandex bot', |
| 1603 | | | | 'yooglifetchagent','<a href="http://www.yoogli.com/" title="yoogliFetchAgent home page [new window]" target="_blank">yoogliFetchAgent</a>', |
| 1604 | | | | 'z\-add\slink\schecker','<a href="http://w3.z-add.co.uk/linkcheck/" title="Z-Add Link Checker home page [new window]" target="_blank">Z-Add Link Checker</a>', |
| 1605 | | | | 'zealbot','ZealBot', |
| 1606 | | | | 'zspider','<a href="http://feedback.redkolibri.com/" title="Bot home page [new window]" target="_blank">zspider</a>', |
| 1607 | | | | 'zeus','<a href="http://www.webmasterworld.com/forum11/1840.htm" title="Bot documentation [new window]" target="_blank">Zeus Webster Pro</a>', |
| 1608 | | | | 'ng\/1\.','<a href="http://www.exabot.com/" title="Bot home page [new window]" target="_blank">NG 1.x (Exalead)</a>', # put at end to avoid false positive |
| 1609 | | | | 'ng\/2\.','<a href="http://www.exabot.com/" title="Bot home page [new window]" target="_blank">NG 2.x (Exalead)</a>', # put at end to avoid false positive |
| 1610 | | | | 'exabot','<a href="http://www.exabot.com/" title="Bot home page [new window]" target="_blank">Exabot</a>', # put at end to avoid false positive |
| 1611 | | | | 'java','<a href="http://www.projecthoneypot.org/harvester_useragents.php" title="Bot home page [new window]" target="_blank">Java (Often spam bot)</a>', # put at end to avoid false positive |
| 1612 | | | | # Generic root ID |
| 1613 | | | | 'robot', 'Unknown robot (identified by \'robot\')', |
| 1614 | | | | 'crawl', 'Unknown robot (identified by \'crawl\')', |
| 1615 | | | | 'spider', 'Unknown robot (identified by \'spider\')', |
| 1616 | | | | '\wbot[\/\-]', 'Unknown robot (identified by \'bot/\' or \'bot-\')', |
| 1617 | | | | # Unknown robots identified by hit on robots.txt |
| 1618 | | | | 'unknown', 'Unknown robot (identified by hit on \'robots.txt\')' |
| 1619 | | | | ); |
| 1620 | | | | |
| 1621 | | | | |
| 1622 | | | | # RobotsAffiliateLib |
| 1623 | | | | # Maps a robot ID pattern to the name of the search engine that robot is used by. |
| 1624 | | | | #------------------------------------------------------------- |
| 1625 | 1 | 1.1e-5 | 1.1e-5 | %RobotsAffiliateLib = ( |
| 1626 | | | | 'fast\-webcrawler'       => 'AllTheWeb', |
| 1627 | | | | 'googlebot'              => 'Google', |
| 1628 | | | | 'msnbot'                 => 'MSN', |
| 1629 | | | | 'nutch'                  => 'Looksmart', |
| 1630 | | | | 'scooter'                => 'AltaVista', |
| 1631 | | | | 'wisenutbot'             => 'Looksmart', |
| 1632 | | | | 'yahoo\-verticalcrawler' => 'Yahoo', |
| 1633 | | | | 'zyborg'                 => 'Looksmart', |
| 1634 | | | | 'cfetch'                 => 'Kosmix', |
| 1635 | | | | '^voyager\/'             => 'Kosmix', |
| 1636 | | | | ); |
| 1637 | | | | |
| 1638 | 1 | 0.00040 | 0.00040 | 1; # Last statement yields a true value; Perl's require treats a file as loaded successfully only if it ends this way |