<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">

<head profile="http://gmpg.org/xfn/11">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />

<title>nextthing.org  &raquo; robots.txt Adventure</title>

<meta name="generator" content="WordPress 3.7.3" /> <!-- leave this for stats -->

<link rel="stylesheet" href="http://www.nextthing.org/wordpress/wp-content/themes/mine2_0/style.css" type="text/css" media="screen" />
<link rel="stylesheet" href="http://www.nextthing.org/wordpress/wp-content/themes/mine2_0/print.css" type="text/css" media="print" />
<link rel="alternate" type="application/rss+xml" title="nextthing.org RSS Feed" href="http://www.nextthing.org/feed" />
<link rel="pingback" href="http://www.nextthing.org/wordpress/xmlrpc.php" />

<link rel="alternate" type="application/rss+xml" title="nextthing.org &raquo; robots.txt Adventure Comments Feed" href="http://www.nextthing.org/archives/2007/03/12/robotstxt-adventure/feed" />
<link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.nextthing.org/wordpress/xmlrpc.php?rsd" />
<link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://www.nextthing.org/wordpress/wp-includes/wlwmanifest.xml" /> 
<link rel='prev' title='New Zealand Trip 2006' href='http://www.nextthing.org/archives/2007/02/09/new-zealand-trip-2006' />
<link rel='next' title='Microsoft Web' href='http://www.nextthing.org/archives/2007/03/31/microsoft-web' />
<meta name="generator" content="WordPress 3.7.3" />
<link rel='canonical' href='http://www.nextthing.org/archives/2007/03/12/robotstxt-adventure' />
<link rel='shortlink' href='http://www.nextthing.org/?p=30' />
</head>
<body>
<div id="page">

<div id="header">
  <h1><a href="http://www.nextthing.org/">nextthing.org</a></h1>
  <h2>by Andrew Wooster</h2>
</div>
<hr />

	<div id="content" class="widecolumn">
    
  
		<div class="navigation">
			<div class="alignleft">&laquo; <a href="http://www.nextthing.org/archives/2007/02/09/new-zealand-trip-2006" rel="prev">New Zealand Trip 2006</a></div>
			<div class="alignright"><a href="http://www.nextthing.org/archives/2007/03/31/microsoft-web" rel="next">Microsoft Web</a> &raquo;</div>
		</div>

		<div class="post" id="post-30">
			<h2><a href="http://www.nextthing.org/archives/2007/03/12/robotstxt-adventure" rel="bookmark" title="Permanent Link: robots.txt Adventure">robots.txt Adventure</a></h2>

			<div class="entry">
				<h3>Introduction.txt</h3>
<p>Last October I got bored and set my spider loose on the robots.txt files of the world. Having had a good deal of positive feedback on my <a href="http://www.nextthing.org/archives/2005/08/07/fun-with-http-headers">HTTP Headers survey</a>, I had decided to poke around in robots.txt files and see what sorts of interesting things I could find. </p>
<p>Since then, I&#8217;ve taken <a href="http://www.nextthing.org/archives/2007/02/09/new-zealand-trip-2006">6 weeks of vacation</a> and gotten to be very busy at <a href="http://www.apple.com/macosx/leopard/">work</a>, so I&#8217;m just now getting around to analyzing all the data I gathered. These are some of the results of that analysis.<span id="more-30"></span></p>
<h3>Robots?</h3>
<p>To those of you completely unaware of what this post is about, here&#8217;s a brief primer. Google is a search engine. You probably use it. If not, odds are you use one of MSN Search (now called &#8220;Live Search&#8221;), Ask Jeeves (now Ask.com), or Yahoo! Search. How do those search engines grab web pages to search? Well, they use robots, also called spiders. Now, these aren&#8217;t the giant metal machines you see <a href="http://www.youtube.com/watch?v=ffLt1z7pBNw">chasing tweaked out English factory workers</a> through the streets of London, nor are they the giant eight-legged creatures you find <a href="http://www.urbandictionary.com/define.php?term=Clock+Spider">lurking behind clocks</a>. Rather, they&#8217;re pieces of software that surf around the web grabbing web pages. Since they&#8217;re software, they can surf the web much faster than humans, as well as find things most humans might overlook. As such, there arose a need for a standard for advising robots on what they should and shouldn&#8217;t look at.</p>
<h3>The Un-Standard</h3>
<p>The Robots Exclusion Protocol arose in June 1994 by consensus among a number of web spider developers. The <a href="http://www.robotstxt.org/wc/norobots.html">original protocol description</a> from 1994 describes the basic syntax of a robots.txt file to be placed at the root of a web site. So, for example, Google would place their robots.txt file at:</p>
<pre>

http://www.google.com/robots.txt

</pre>
<p>The basic format goes something like this. First, the file specifies a User-agent (the name of the robot) that is to follow the subsequent rules (until the next User-agent line):</p>
<pre>
User-agent: SuperHappyRobot
</pre>
<p>This line tells &#8220;SuperHappyRobot&#8221; that it needs to pay attention to the next few lines. Any other robot will ignore these rules. The next line might look something like:</p>
<pre>
Disallow: /tmp/
</pre>
<p>This means SuperHappyRobot shouldn&#8217;t download any pages that start with the path &#8220;/tmp/&#8221; from this server. Variations on these lines are that * will match any robot name (in other words, &#8220;User-agent: *&#8221; should tell all the robots to pay attention), and blank Disallow statements mean anything goes. So, Apple&#8217;s robots.txt file of:</p>
<pre>
# robots.txt for http://www.apple.com/
User-agent: *
Disallow: 
</pre>
<p>means, essentially, that any robot is free to grab any page it can get its hands on, at least for the &#8220;www.apple.com&#8221; website.</p>
<p>So, that was all well and good, but around 1996 there was a push to try to get robots.txt standardized, and an <a href="http://www.robotstxt.org/wc/norobots-rfc.html">IETF draft</a> was produced that clarified and added to the robots.txt syntax. The primary addition was a new &#8220;Allow&#8221; rule, which allowed a little more fine-grained control over which pages could be retrieved. For example, with the following set of rules:</p>
<pre>
User-agent: *
Disallow: /apache/
Allow: /apache/02/03/11/2228242.shtml
</pre>
<p>All documents except &#8220;/apache/02/03/11/2228242.shtml&#8221; in the &#8220;/apache/&#8221; path would be excluded from spidering. There was also a provision for &#8220;extensions&#8221; to the protocol, such that a rule line like &#8220;Crawl-delay: 10&#8221; could be added. Spiders that didn&#8217;t support that extension would ignore it, while spiders that did might delay 10 seconds between page fetches.</p>
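<p>One wrinkle worth noting: the draft says a robot should apply the first matching rule in file order, while Google, at least, documents picking the most specific (longest) matching rule, which is what makes the example above work regardless of rule ordering. Here is a minimal sketch of the longest-match evaluation (the function name and rule representation are mine, not from either document):</p>
<pre>
def is_allowed(rules, path):
    # rules: (directive, prefix) pairs from one record, in file order, e.g.
    # [("disallow", "/apache/"), ("allow", "/apache/02/03/11/2228242.shtml")]
    best = None  # (prefix length, directive) of the most specific match so far
    for directive, prefix in rules:
        if prefix and path.startswith(prefix):
            if best is None or len(prefix) > best[0]:
                best = (len(prefix), directive)
    # No matching rule, or an empty Disallow, means access is allowed.
    return best is None or best[1] == "allow"
</pre>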
<p>Around the same time the IETF draft was being discussed, Sean &#8220;Captain Napalm&#8221; Conner proposed <a href="http://www.conman.org/people/spc/robots2.html">his own extension</a> to the Robots Exclusion Protocol, which included Allow rules as well as regular expression syntax for rules, and new Robot-version, Visit-time, Request-rate, and Comment rules. Fewer than 100 of the sites I visited use rules unique to this spec.</p>
<p>Since none of these three documents have ever been ratified or adopted by a standards body, there has been a bit of persistent confusion over what constitutes a valid robots.txt document. The most definitive document is certainly the original 1994 document. Most commercial robots today, however, attempt to conform to the IETF draft document. And, given the large number of Allow rules around, it would be remiss of a robot not to try.</p>
<h3>A Touch of Controversy</h3>
<p>This de-facto standard has had its share of controversy over the years. Many webmasters object to having to opt out of spiders crawling their site. Given that I found 47,738 sites that disallow spidering the root of their site with the wildcard (*) user-agent match, it appears that viewpoint still has many adherents, most of whom just want to be left alone by the bulk of spiders. See the comments in <a href="http://www.threadwatch.org/node/10055">this thread</a> for some examples of this opinion from some relatively tech-savvy webmasters. Among them is the well-known <a href="http://incredibill.blogspot.com/">IncrediBILL</a>:</p>
<blockquote><p>
Lack of a robots.txt file should mean just that, they don&#8217;t know about robots so robots should STAY THE HELL OUT!
</p></blockquote>
<p>I&#8217;ll come back to this later.</p>
<p>Others have objected to the idea of putting up a roadmap to secret pages on their sites. Bertrand Meyer, the designer of Eiffel (the programming language, not the Tower) and a Very Smart Person, even holds this viewpoint. To <a href="http://archive.eiffel.com/private/meyer/robots.html">quote</a>:</p>
<blockquote><p>
If you are just a bit absent-minded, isn&#8217;t it natural<br />
to use this mechanism to exclude stuff from being indexed and hence believe<br />
no one will find it? &#8220;Stupid&#8221;, maybe &#8212; but not unlikely.
</p></blockquote>
<p>Indeed, scanning through the robots.txt files I pulled down, I find disallow rules for 3,000+ &#8220;phpMyAdmin&#8221; paths, 40,000+ &#8220;stats&#8221; paths, 31,000+ &#8220;log&#8221; paths, 400+ &#8220;secret&#8221; paths, 100,000+ &#8220;admin&#8221; paths, and a host of other interesting looking entries. Even if the vast majority of these are properly secured with authentication, the chances of a few people being absent-minded, as Bertrand might say, are pretty good.</p>
<p>On the flip side of these opinions, there are those who have always viewed, and want to continue to view, robots.txt as a merely advisory standard. As courts and legislative bodies have begun to apply the force of law to this loose consensus protocol, some have <a href="http://www.eweek.com/article2/0,1759,1248105,00.asp">spoken out</a> in favor of information transparency and the essential openness of the Internet, including Martijn Koster, the creator of the protocol:</p>
<blockquote><p>
&#8220;I don&#8217;t think that&#8217;s in the spirit of free information exchange,&#8221; Koster says. Some robots may have legitimate reasons to ignore robot exclusion directives. For example, he says, a company might use robots to hunt for copyright infringing content.
</p></blockquote>
<h3>Methodology</h3>
<p>Having written a spider for my <a href="http://www.nextthing.org/archives/2005/08/07/fun-with-http-headers">HTTP headers survey</a> and run it against all of the domains in the <a href="http://www.dmoz.org/">Open Directory</a>, I already had a large collection of web sites, and a decent spider. I further added to my list of domains by extracting links from the pages I&#8217;d downloaded for that project. Then, I ran my spider (written in Python, using PycURL) against this expanded list of domains, attempting to retrieve the robots.txt file at each. The HTTP headers and full body of the response were stored in a MySQL database. This database was then dumped via a custom &#8220;Big File&#8221; implementation, which amounted to a bit more than 12GB on disk. Then, I wrote an analyzer which could run through this logical file, processing the records, recording interesting statistics about the entries and reporting the results. This analyzer takes about half an hour to run on the dataset. In total, I received responses from about 4.6 million unique domains.</p>
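<p>The fetch stage amounts to very little code. The following isn&#8217;t my actual spider, just a minimal single-URL sketch in the same stack (Python with PycURL); the user-agent string is a placeholder:</p>
<pre>
import io
import pycurl

def fetch_robots_txt(domain, timeout=30):
    # Fetch one robots.txt; return (status code, content type, raw body).
    body = io.BytesIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, "http://%s/robots.txt" % domain)
    c.setopt(pycurl.WRITEFUNCTION, body.write)
    c.setopt(pycurl.FOLLOWLOCATION, 0)  # record redirects rather than chase them
    c.setopt(pycurl.TIMEOUT, timeout)
    c.setopt(pycurl.USERAGENT, "research-spider/1.0")
    c.perform()
    status = c.getinfo(pycurl.HTTP_CODE)
    content_type = c.getinfo(pycurl.CONTENT_TYPE)
    c.close()
    return status, content_type, body.getvalue()
</pre>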
<h3>Status Codes</h3>
<p>HTTP status codes (aka response codes) tell web browsers and robots both what kind of response they&#8217;re getting when they download a page. For example, &#8220;200&#8221; means everything is okay and &#8220;404&#8221; means the web server couldn&#8217;t find the file the browser requested. The IETF robots.txt spec says that a 404 response for robots.txt means the site is unrestricted for robots, and a 2XX response means the robot must respect the returned robots.txt content. Other status codes have recommended behaviors, but they&#8217;re not required.</p>
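<p>Put as code, the required handling is tiny; everything outside these two cases is left to the robot&#8217;s discretion (a sketch with my own naming):</p>
<pre>
def policy_for_status(status):
    if status == 404:
        return "unrestricted"   # no robots.txt, so any page may be fetched
    if status // 100 == 2:
        return "obey"           # 2XX: the returned rules must be respected
    return "robots-choice"      # everything else is merely recommended
</pre>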
<p>Status codes are interesting primarily because they give a quick count of how many sites have a robots.txt file. I got responses from 4.6 million sites, so by tallying the response codes of different types, I can tell who has a robots.txt file and who doesn&#8217;t:</p>
<table>
<tr>
<th>Status Code</th>
<th>Count</th>
</tr>
<tr>
<td>404</td>
<td>3,008,767</td>
</tr>
<tr>
<td>200</td>
<td>1,217,303</td>
</tr>
<tr>
<td>302</td>
<td>276,106</td>
</tr>
<tr>
<td>301</td>
<td>72,674</td>
</tr>
<tr>
<td>403</td>
<td>15,675</td>
</tr>
<tr>
<td>400</td>
<td>5,570</td>
</tr>
<tr>
<td>401</td>
<td>3,856</td>
</tr>
<tr>
<td>500</td>
<td>2,841</td>
</tr>
<tr>
<td>410</td>
<td>1,450</td>
</tr>
<tr>
<td>303</td>
<td>1,319</td>
</tr>
<tr>
<td>503</td>
<td>890</td>
</tr>
<tr>
<td>304</td>
<td>529</td>
</tr>
<tr>
<td>501</td>
<td>280</td>
</tr>
<tr>
<td>502</td>
<td>227</td>
</tr>
<tr>
<td>307</td>
<td>218</td>
</tr>
<tr>
<td>204</td>
<td>215</td>
</tr>
<tr>
<td>300</td>
<td>100</td>
</tr>
<tr>
<td>504</td>
<td>60</td>
</tr>
<tr>
<td>406</td>
<td>58</td>
</tr>
<tr>
<td>419</td>
<td>45</td>
</tr>
<tr>
<td>550</td>
<td>36</td>
</tr>
<tr>
<td>202</td>
<td>34</td>
</tr>
<tr>
<td>999</td>
<td>17</td>
</tr>
<tr>
<td>100</td>
<td>12</td>
</tr>
<tr>
<td>418</td>
<td>10</td>
</tr>
<tr>
<td>201</td>
<td>7</td>
</tr>
<tr>
<td>405</td>
<td>6</td>
</tr>
<tr>
<td>423</td>
<td>6</td>
</tr>
<tr>
<td>666</td>
<td>3</td>
</tr>
<tr>
<td>402</td>
<td>3</td>
</tr>
<tr>
<td>415</td>
<td>3</td>
</tr>
<tr>
<td>407</td>
<td>2</td>
</tr>
<tr>
<td>510</td>
<td>2</td>
</tr>
<tr>
<td>490</td>
<td>1</td>
</tr>
<tr>
<td>505</td>
<td>1</td>
</tr>
<tr>
<td>509</td>
<td>1</td>
</tr>
<tr>
<td>900</td>
<td>1</td>
</tr>
<tr>
<td>409</td>
<td>1</td>
</tr>
<tr>
<td>408</td>
<td>1</td>
</tr>
<tr>
<th>Total:</th>
<td>4,608,330</td>
</tr>
</table>
<p>Broken down by class, we get:</p>
<table>
<tr>
<th>Class</th>
<th>Count</th>
<th>% of Total</th>
</tr>
<tr>
<td>5xx</td>
<td>4,338</td>
<td>0.09</td>
</tr>
<tr>
<td>4xx</td>
<td>3,035,454</td>
<td>65.86</td>
</tr>
<tr>
<td>3xx</td>
<td>350,946</td>
<td>7.61</td>
</tr>
<tr>
<td>2xx</td>
<td>1,217,559</td>
<td>26.42</td>
</tr>
<tr>
<td>1xx</td>
<td>12</td>
<td>0.00</td>
</tr>
<tr>
<td>invalid</td>
<td>21</td>
<td>0.00</td>
</tr>
</table>
<p>As we can see above, around 65% of sites return a 4XX status code, indicating they don&#8217;t have a robots.txt file. Another 7.6% redirect to a different URL, usually either the home page or an error page. This means, essentially, that about 26% of sites are attempting to serve up a valid robots.txt file. Of course, some sites may improperly return an error page with a 2xx status code, so this is only useful as a quick estimate.</p>
<h3>MIME Types</h3>
<p>MIME types (aka content types) are returned in the headers of HTTP responses by web servers to tell clients what the document&#8217;s type is. They consist of a type (text, image, etc), a subtype (like html or jpeg) and some other optional parameters (like the character encoding). So, for example, an HTML file usually has a MIME type like &#8220;text/html&#8221; and a text file a type like &#8220;text/plain&#8221;. An image file might have a MIME type like &#8220;image/gif&#8221; or &#8220;image/jpeg&#8221;. The IANA keeps an official list of registered MIME types at <a href="http://www.iana.org/assignments/media-types/">http://www.iana.org/assignments/media-types/</a>. </p>
<p>The only MIME type that should be returned for a valid robots.txt file is text. True, the specs don&#8217;t specifically mention MIME types, but sites like Google <a href="http://www.google.com/support/webmasters/bin/answer.py?answer=35237">follow the general HTTP rule</a> of &#8220;if it&#8217;s not text/*, it&#8217;s not really plain text&#8221;. Of the robots.txt files I got back, 109,780 had MIME types other than text/plain. So, it should be no surprise that the big 3 search engines (Yahoo!, Google, and MSN) will all attempt to parse any text robots.txt file they get back from the server. For example, <a href="http://www.digg.com/robots.txt">Digg.com serves up</a> their robots.txt file as &#8220;text/html; charset=UTF-8&#8221;. Google, MSN, and Yahoo! all obey the rules in the file.</p>
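<p>The lenient check the engines appear to apply is simple: strip any parameters from the Content-Type header and accept anything in the text tree. A sketch (my naming, not any engine&#8217;s actual code):</p>
<pre>
def looks_like_text(content_type):
    if not content_type:
        return False
    media_type = content_type.split(";")[0].strip().lower()
    return media_type.startswith("text/")

looks_like_text("text/html; charset=UTF-8")  # True, so Digg's file gets parsed
</pre>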
<p>Besides text/html and text/plain, some of the more common MIME types I got back were application/octet-stream, application/x-httpd-php, text/x-perl (mostly error pages), video/x-ms-asf, application/x-httpd-cgi, image/gif, and image/jpeg.</p>
<p>Even among files ostensibly marked as text, there were a wide variety of questionable MIME types:</p>
<table>
<tr>
<th>Count</th>
<th>Content Type</th>
</tr>
<tr>
<td>2</td>
<td>application/txt</td>
</tr>
<tr>
<td>5</td>
<td>application/x-txt</td>
</tr>
<tr>
<td>2</td>
<td>file/txt</td>
</tr>
<tr>
<td>1</td>
<td>internal-gopher-text</td>
</tr>
<tr>
<td>30</td>
<td>plain/text</td>
</tr>
<tr>
<td>12</td>
<td>text</td>
</tr>
<tr>
<td>13</td>
<td>text/R*ch</td>
</tr>
<tr>
<td>2</td>
<td>text/aleph_save</td>
</tr>
<tr>
<td>2</td>
<td>text/ascii</td>
</tr>
<tr>
<td>6</td>
<td>text/asp</td>
</tr>
<tr>
<td>36</td>
<td>text/css</td>
</tr>
<tr>
<td>2</td>
<td>text/dhtml</td>
</tr>
<tr>
<td>73</td>
<td>text/enriched</td>
</tr>
<tr>
<td>1</td>
<td>text/htm</td>
</tr>
<tr>
<td>1</td>
<td>text/illegal</td>
</tr>
<tr>
<td>1</td>
<td>text/javascript</td>
</tr>
<tr>
<td>2</td>
<td>text/octet-stream</td>
</tr>
<tr>
<td>1</td>
<td>text/plane</td>
</tr>
<tr>
<td>4</td>
<td>text/rtf</td>
</tr>
<tr>
<td>1</td>
<td>text/ssi html</td>
</tr>
<tr>
<td>3</td>
<td>text/svg</td>
</tr>
<tr>
<td>3</td>
<td>text/text</td>
</tr>
<tr>
<td>9</td>
<td>text/txt</td>
</tr>
<tr>
<td>20</td>
<td>text/vnd.wap.wml</td>
</tr>
<tr>
<td>5</td>
<td>text/x-component</td>
</tr>
<tr>
<td>87</td>
<td>text/x-invalid</td>
</tr>
<tr>
<td>1</td>
<td>text/x-log</td>
</tr>
<tr>
<td>386</td>
<td>text/x-perl</td>
</tr>
<tr>
<td>2</td>
<td>text/x-python</td>
</tr>
<tr>
<td>40</td>
<td>text/x-server-parsed-html</td>
</tr>
<tr>
<td>23</td>
<td>text/xml</td>
</tr>
<tr>
<td>11</td>
<td>txt</td>
</tr>
</table>
<h3>No, Really, Robots Dot TEXT</h3>
<p>An error similar to using the wrong content type is uploading a robots.txt file in a format other than plain text. Popular mistakes here include Word documents (examples: <a href="http://www.purpleandfinelinen.co.uk/robots.txt">1</a>, <a href="http://www.toskana-ligurien.de/robots.txt">2</a>, <a href="http://www.friesmed.ch/robots.txt">3</a>), RTF documents (examples: <a href="http://www.tierheilpraxis-domingos-home.de/robots.txt">1</a>, <a href="http://www.magnum.com.tw/robots.txt">2</a>, <a href="http://www.byweis.de/robots.txt">3</a>), and HTML. I even found LaTeX and KOffice documents.</p>
<p>One piece of server software (called Cougar, which looks, as near as I can tell, to be either Microsoft Small Business Server or IIS) even spits out ASF streaming video files when asked for a robots.txt file (examples: <a href="http://msmedia.dot.ca.gov/robots.txt">1</a>, <a href="http://vista.streamguys.com/robots.txt">2</a>). Fun.</p>
<h3>Invalid Encodings</h3>
<p><a href="http://en.wikipedia.org/wiki/Character_encoding">Character encodings</a> specify what letters and other characters correspond to which specific bits. Sites specify what character set a response is in within the Content-type header. Some sites serve up robots.txt files in little-used encodings, such as UTF-16. UTF-16 is tricky for a number of reasons, not the least of which are the different endian encodings. Of the 463 UTF-16 files I found, approximately 10% were not valid UTF-16, even though they included a UTF16 BOM.</p>
<p>Otherwise, I saw close to 300 unique character sets claimed by servers, even after discarding obviously incorrect ones and lowercasing them all. These included some I hadn&#8217;t seen before, like &#8220;nf_z_62-010&#8221;, &#8220;ibm-939&#8221;, and &#8220;fi_fi.iso-8859-15@euro&#8221;.</p>
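<p>A spider has to cope with all of this somehow. A rough sketch of BOM-first decoding with a lenient fallback (the fallback choice here is mine; real spiders no doubt differ):</p>
<pre>
import codecs

def decode_robots_body(raw):
    boms = ((codecs.BOM_UTF16_LE, "utf-16-le"),
            (codecs.BOM_UTF16_BE, "utf-16-be"),
            (codecs.BOM_UTF8, "utf-8"))
    for bom, encoding in boms:
        if raw.startswith(bom):
            try:
                return raw[len(bom):].decode(encoding)
            except UnicodeDecodeError:
                break  # the ~10% case: a BOM stapled onto an invalid body
    return raw.decode("latin-1")  # never raises; fine for ASCII-only rules
</pre>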
<h3>Comments</h3>
<p>robots.txt files have one and only one proper comment syntax, which is to put comments after a hash mark (#). However, I found HTML comments (&lt;!-- --&gt;), C++ style comments (//), and a variety of others, including simple inline comments.</p>
<h3>Totally Confused</h3>
<p>Some people seem rather befuddled as to what constitutes a robots.txt file. For example, the most common confusion I&#8217;ve found is people using the raw text dump of the <a href="http://www.robotstxt.org/wc/active.html">Web Robots Database</a> as their robots.txt file. I&#8217;m not just talking about a couple of sites, either. Approximately 1 in every 1000 websites I looked at does this. It&#8217;s really quite bizarre. This seems to be part of a more general mistake wherein people copy instructions on how to set up a robots.txt file into the contents of robots.txt files. For example, here are a few: <a href="http://www.cooljobscanada.com/robots.txt">www.cooljobscanada.com</a>, <a href="http://www.numis.co.uk/robots.txt">www.numis.co.uk</a>, <a href="http://www.volubilis2000.com/robots.txt">www.volubilis2000.com</a>, <a href="http://www.kickapoo-orchard.com/robots.txt">www.kickapoo-orchard.com</a>, <a href="http://www.aplussupply.com/robots.txt">www.aplussupply.com</a>.</p>
<p>Then there are just the random things you find. <a href="http://www.crowndiamond.org/robots.txt">Religious</a> <a href="http://www.rjatropical.com/robots.txt">texts</a> and <a href="http://www.gracefree.org/robots.txt">descriptions</a> of <a href="http://www.the-good-news.org/robots.txt">churches</a>. A <a href="http://www.handsonus.com/robots.txt">catalog for MIDI tracks</a>. </p>
<p>ASCII art, both <a href="/blog/cache/incharge.org.robots.txt">pornography</a> <a href="http://lemondrop.freeradiosaic.org/robots.txt">and</a> <a href="http://www.philippinefiesta.com/robots.txt">otherwise</a>. </p>
<p>A <a href="http://www.shugashack.com/robots.txt">list of videogames</a>. <a href="http://perfectillusions.com/robots.txt">Several</a> <a href="http://www.le-pics.de/robots.txt">.htaccess</a> <a href="http://vanderwoning.com/robots.txt">files</a>. Access <a href="http://www.nowasol.com.pl/robots.txt">logs</a>. Lists of <a href="http://www.catallix.com/robots.txt">keywords</a> <a href="http://www.tcvirtual.com/robots.txt">and</a> <a href="http://www.kreisky.org/robots.txt">website</a> <a href="http://www.businesscareers.com/robots.txt">descriptions</a>, including an actual <a href="http://www.wasser.adv.br/robots.txt" rel="nofollow">keyword stuffing example</a>. Bash scripts, PHP pages, and everything in between. </p>
<p>I even found <a href="http://mylogo.incruit.com/robots.txt">image files</a> being served for robots.txt. Not to mention <a href="http://www.jimhendersonrealty.com/robots.txt">e-mail messages</a> and <a href="http://www.orbikron.com/robots.txt">newsgroup postings</a>.</p>
<p>There&#8217;s even a description of a swimming pool. <a href="http://www.allgaeu-bad.de/robots.txt">In German</a>.</p>
<p>And, of course, plenty of human-readable <a href="/blog/cache/lije.commissions.leg.state.mn.us.robots.txt">instructions</a> to robots which can&#8217;t read them: <a href="http://www.corsicamania.com/robots.txt">http://www.corsicamania.com/robots.txt</a>.</p>
<h3>info.txt</h3>
<p><a href="http://www.aota.net/forums/archive/index.php/t-18373.html">Apparently</a> there&#8217;s <a href="http://www.varisearch.com/Webmasters/PromotingYourSite/TipSheets/Alexa.htm">another protocol</a>, similar to robots.txt, for advertising the contact information for a site. A file called info.txt is supposed to be placed in the root of the site, which sites like Alexa will look for when trying to find out who owns the domain. I found a <a href="http://www.melodilerim.com/robots.txt">lot</a> of these <a href="http://ramast.divagaciones.com/robots.txt">records</a> in the robots.txt files.</p>
<p>Someday I&#8217;ll have to see how many of these there are in the wild.</p>
<h3>Wildcards</h3>
<p>There are no wildcards (also known as pattern matching) in the official robots.txt specs, but various search engines have added extensions to support this.</p>
<p>For example, <a href="http://www.google.com/support/webmasters/bin/answer.py?answer=40367&#038;topic=8846">Google</a>, <a href="http://search.msn.com.sg/docs/siteowner.aspx?t=SEARCH_WEBMASTER_REF_RestrictAccessToSite.htm">MSN Search</a>, and <a href="http://help.yahoo.com/help/us/ysearch/slurp/slurp-02.html">Yahoo!</a> allow an asterisk (*) to match any sequence of characters, and a dollar sign ($) to match the end of the URL. So, to block spiders from downloading any JPEG image files, one might use:</p>
<pre>
User-agent: *
Disallow: /*.jpg$
</pre>
<p>Indeed, blocking spidering of certain file types is the most popular use for wildcards. Most people who are using wildcards for anything else are doing so entirely unnecessarily. For example, a lot of sites have the following rule:</p>
<pre>
Disallow: /RealEstateTips/*
</pre>
<p>The use of the non-standard wildcard above is useless, as this rule is equivalent to:</p>
<pre>
Disallow: /RealEstateTips/
</pre>
<p>This is because rules are by default partial paths, and will match any path beginning with that string. It&#8217;s also worth noting that of all the sites which have the above rule with the wildcard, none of them have the rule without the wildcard. So, a spider which didn&#8217;t support pattern matching would be free to download URLs starting with &#8220;/RealEstateTips/&#8221;, since a literal reading of the rule only blocks paths with an actual asterisk after the second slash.</p>
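<p>Supporting these extensions is straightforward if each rule is translated into a regular expression. A sketch of the Google/Yahoo!/MSN semantics described above (the function name is mine):</p>
<pre>
import re

def wildcard_rule_to_regex(rule):
    anchored = rule.endswith("$")
    if anchored:
        rule = rule[:-1]
    # * matches any run of characters; everything else is literal
    pattern = "".join(".*" if ch == "*" else re.escape(ch) for ch in rule)
    return re.compile("^" + pattern + ("$" if anchored else ""))

jpg_rule = wildcard_rule_to_regex("/*.jpg$")
jpg_rule.match("/photos/cat.jpg")    # matches, so the fetch is disallowed
jpg_rule.match("/photos/cat.jpeg")   # no match, so the fetch is allowed
</pre>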
<h3>Common Syntax Errors</h3>
<p>So, besides the above, what are some of the common errors? The spec says that records are separated by blank lines, and the most common errors center around that. The most frequent is putting a blank line between a User-agent line and the rules that should apply to it, with 74,043 files doing this. Next up is the placement of a Disallow or Allow rule with no User-agent or Disallow/Allow rule immediately before it, with 64,921 files making this mistake. The next is placing a User-agent line immediately after a Disallow/Allow line, with no blank line in between; 32,656 files did this. Finally, lines which were neither comments, nor blank, nor rules showed up in 22,269 files.</p>
<h3>Crawl-delay</h3>
<p>The IETF robots.txt draft spec includes a provision for extensions to the robots.txt format. Basically, along with &#8220;Allow&#8221; and &#8220;Disallow&#8221; lines, spiders can optionally support extensions for enhanced control over the robot&#8217;s behavior. The most widely-deployed of these is the Crawl-delay extension.</p>
<p>MSN Search, <a href="http://help.yahoo.com/help/us/ysearch/slurp/slurp-03.html">Yahoo!</a>, and Ask all support Crawl-delay, which is used to insert a delay between successive accesses of a web server. A typical Crawl-delay might look something like this:</p>
<pre>
User-agent: *
Crawl-delay: 5
</pre>
<p>Spiders that support Crawl-delay would interpret this as meaning they should wait 5 seconds between requests to the site. I found tens of thousands of these entries.</p>
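<p>Honoring it is just a matter of throttling requests on a per-host basis; a minimal sketch (assuming some fetch function like the one in the Methodology section):</p>
<pre>
import time

def polite_fetch(paths, fetch, crawl_delay=5):
    last_request = 0.0
    for path in paths:
        wait = crawl_delay - (time.time() - last_request)
        if wait > 0:
            time.sleep(wait)   # wait out the remainder of the delay
        last_request = time.time()
        yield fetch(path)
</pre>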
<h3>Typos!</h3>
<p>I found a LOT of typos in these files. You wouldn&#8217;t think it would be very hard to spell the limited vocabulary of &#8220;User-agent&#8221; and &#8220;Disallow&#8221; correctly, but you&#8217;d be wrong. For example, I found <a href="/blog/cache/disallow.txt">69 typos of Disallow</a>. 69! That&#8217;s not even counting the ones I found with weird characters in the middle of the word.</p>
<h3>Fingerprinting Using robots.txt</h3>
<p>Sometimes, we can use robots.txt file contents for fingerprinting the sites that serve them up. For example, we can fingerprint the sites designed by <a href="http://www.moriah.com/">Moriah.com</a> by looking for robots.txt files with the contents:</p>
<pre>
this file placed here so you don't fill up my error log looking for it :-)
</pre>
<p>Examples: <a href="http://www.lighthouselodging.com/">1</a>, <a href="http://www.riversedge-bb.com/">2</a>, <a href="http://www.applecountryinnbb.com/">3</a>, <a href="http://www.eastviewcountry.com/">4</a>, <a href="http://www.thefarmbnb.com/">5</a>.</p>
<p>Similarly, we can find the more than 7,000 real estate sites designed by <a href="http://www.advancedaccess.com">Advanced Access</a> by looking for the rule:</p>
<pre>
Disallow: /RealEstateTips/*
</pre>
<p>More usefully, we can identify one Korean domain squatter by looking for robots.txt files that contain only a meta tag like:</p>
<pre>
&lt;meta http-equiv=refresh content='0;url=http://www.hiplayer.com'&gt;
</pre>
<p>At the time I spidered, we could identify another domain squatter by looking for a robots.txt file like:</p>
<pre>
User-agent: * 
Disallow: /pixel/
Disallow: /library/
Disallow: /results_monitor.asp
</pre>
<p>They&#8217;ve since switched to a more generic, but still easily-identifiable robots.txt file.</p>
<p>Using similar methods, it&#8217;s easy to find a lot more domain squatters, mass-hosted websites, etc. A search engine could potentially maintain a list of such signatures and, based solely on the robots.txt file, not bother indexing the page. Or, more generally, it could increase or decrease the relevance and ranking of the site in its search results.</p>
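<p>A search engine&#8217;s version of this would presumably be a signature table consulted before indexing. Something like the following sketch, where the table contents are just the examples from this post:</p>
<pre>
SIGNATURES = {
    "Disallow: /RealEstateTips/*": "Advanced Access real estate template",
    "this file placed here so you don't fill up my error log":
        "Moriah.com site design",
}

def fingerprint(robots_body):
    for marker, label in SIGNATURES.items():
        if marker in robots_body:
            return label
    return None  # no known signature
</pre>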
<h3>Conclusions</h3>
<p>Okay, so what conclusions can we draw from this mess of data? The primary conclusion, I think, is that the Robots Exclusion Protocol is more complicated in practice than it seems. As a spider, in order to properly parse the variety of robots.txt files you&#8217;ll find in the wild, you&#8217;ll need to write an extremely lenient parser (following the <a href="http://en.wikipedia.org/wiki/Robustness_Principle">Robustness Principle</a>), mostly ignore content types, handle a variety of character encodings (and in many cases ignore those returned by the server), detect HTML and other content returned in the guise of robots.txt files, and potentially implement multiple extensions to the accepted standard.</p>
<p>How about the position, discussed above, that spiders shouldn&#8217;t spider or download content without the explicit permission of the webmaster? Belgium has certainly <a href="http://www.out-law.com/page-7759">come down</a> on the side of requiring explicit permission. However, the evidence shows that <a href="http://www.threadwatch.org/node/10055">Google is in the right</a> on this one:</p>
<blockquote><p>
&#8220;Given the vast size of the Internet, it is impossible for a search engine to contact personally each owner of a web page to determine whether the owner desires its web page to be searched, indexed or cached&#8230; If such advanced permission was required, the internet would promptly grind to a halt,&#8221; Google&#8217;s senior counsel and head of public policy Andrew McLaughlin told the Senate Legal and Constitutional Affairs Committee.
</p></blockquote>
<p>As seen in the status codes section, if this were to happen, nearly three quarters of domains on the web would go &#8220;dark&#8221; for search engines. If these sites went dark for search engines, they would essentially be offline for the majority of web users. Such an action would be in nobody&#8217;s best interest: not the site owner&#8217;s, and certainly not the web-using public&#8217;s.</p>
<p>On a less serious note, it&#8217;s always interesting to see just how vast the Internet really is. Few things drive that home for me as much as seeing how varied the content people generate on the web can be.</p>
<p>So, until next time, I leave you with a quote from one of the robots.txt files I came across:</p>
<blockquote><p>
are you searching something??? :)
</p></blockquote>
<p>Yes. Yes I am. And so far, every time I look, I find it.</p>
<p><b>More resources:</b></p>
<ul>
<li><a href="http://www.alexa.com/site/devcorner/samples?page=rdt">Alexa robots.txt Search example</a></li>
<li><a href="http://www.google.com/webmasters/">Google Webmaster Tools</a> &#8211; Includes a robots.txt validator.</li>
<li><a href="http://www.sun.com/robots.txt">Sun&#8217;s amusing robots.txt file</a></li>
<li>Google Blog on the Robots Exclusion Protocol: <a href="http://googleblog.blogspot.com/2007/01/controlling-how-search-engines-access.html">First Post</a>, <a href="http://googleblog.blogspot.com/2007/02/robots-exclusion-protocol.html">Second</a>.</li>
</ul>

				
				<p class="postmetadata alt">
					<small>
						This entry was posted
						 
						on Monday, March 12th, 2007 at 11:58 PM						and is filed under <a href="http://www.nextthing.org/categories/general" title="View all posts in General" rel="category tag">General</a>, <a href="http://www.nextthing.org/categories/programming" title="View all posts in Programming" rel="category tag">Programming</a>.
						You can follow any responses to this entry through the <a href='http://www.nextthing.org/archives/2007/03/12/robotstxt-adventure/feed'>RSS 2.0</a> feed. 

													You can <a href="#respond">leave a response</a>, or <a href="http://www.nextthing.org/archives/2007/03/12/robotstxt-adventure/trackback" rel="trackback">trackback</a> from your own site.

						
					</small>
				</p>

			</div>
		</div>

	
<!-- You can start editing here. -->
<div id="comment-container">
	<h3 id="comments">19 Responses to &#8220;robots.txt Adventure&#8221;</h3> 

	<ol class="commentlist">

	
		<li class="alt" id="comment-2600">
			<cite><a href='http://homepage.mac.com/simx/technonova/index.html' rel='external nofollow' class='url'>Simone Manganelli</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-2600" title="">March 13th, 2007 at 12:37 AM</a> </small>

			<p>Haha, the things you find on the internets.  I was particularly amused by the religious texts, the ASCII art, and the various misspellings of the word &#8220;disallow&#8221;.</p>
<p>Keep these entries coming; they&#8217;re really fun to read.  <img src='http://www.nextthing.org/wordpress/wp-includes/images/smilies/icon_smile.gif' alt=':)' class='wp-smiley' />   Hmm, let&#8217;s see&#8230; what can I change on my website that you&#8217;ll catch on your next internet survey&#8230;</p>

		</li>

	
	
		<li class="" id="comment-2803">
			<cite><a href='http://grumpyandfarting.blogspot.com' rel='external nofollow' class='url'>YesBut</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-2803" title="">March 19th, 2007 at 11:02 AM</a> </small>

<p>This is the eighth stop in YesBut&#8217;s tour of blog land. I arrived here by entering the key words “quick estimate” in Google Blog search.</p>
<p>I think most bloggers love to have spiders crawling all over their blog. But now I must move on using the keyword chosen at random from your blog “English factory”. If you want to know where I have come from and where the key word takes me check my blog<br />
<a href="http://grumpyandfarting.blogspot.com" rel="nofollow">http://grumpyandfarting.blogspot.com</a> on Tuesday 20th March</p>

		</li>

	
	
		<li class="alt" id="comment-9918">
			<cite><a href='http://none' rel='external nofollow' class='url'>mike vidal</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-9918" title="">August 2nd, 2007 at 1:16 PM</a> </small>

			<p>Cougar is Windows Media Server.</p>

		</li>

	
	
		<li class="" id="comment-9966">
			<cite><a href='http://planetozh.com/' rel='external nofollow' class='url'>Ozh</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-9966" title="">August 14th, 2007 at 1:35 AM</a> </small>

<p>Awesome. Just found your HTTP headers survey and this article, and I completely love them. I completely dig this kind of rather useless and fun statistics. More of these!<br />
Unsolicited suggestions for future surveys? HTML meta tags, favicon.ico, unprotected .htaccess?</p>

		</li>

	
	
		<li class="alt" id="comment-10236">
			<cite><a href='http://www.joegrossberg.com' rel='external nofollow' class='url'>Joe Grossberg</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10236" title="">September 21st, 2007 at 7:00 PM</a> </small>

			<p>Yay, original research.</p>
<p>I&#8217;m sure many of us did at least one of these things wrong, like typing &#8220;disallwo&#8221; [sic] and not checking the spelling.</p>

		</li>

	
	
		<li class="" id="comment-10238">
			<cite>Chris Sidi</cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10238" title="">September 21st, 2007 at 9:28 PM</a> </small>

			<p>How many sites got everything &#8211; the response code, the mime-type, the syntax &#8211; &#8220;right&#8221;?</p>

		</li>

	
	
		<li class="alt" id="comment-10239">
			<cite><a href='http://openelements.info' rel='external nofollow' class='url'>Mario</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10239" title="">September 22nd, 2007 at 12:09 AM</a> </small>

			<p>Interesting post Andrew, I learned something new today. Coincidentally, I looked at the robots file on your site, it reads &#8220;# Nothing to see here. <img src='http://www.nextthing.org/wordpress/wp-includes/images/smilies/icon_smile.gif' alt=':-)' class='wp-smiley' /> &#8221;.</p>

		</li>

	
	
		<li class="" id="comment-10240">
			<cite><a href='http://pauldwaite.co.uk/' rel='external nofollow' class='url'>pauldwaite</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10240" title="">September 22nd, 2007 at 5:46 AM</a> </small>

			<p>I can&#8217;t believe people think sites shouldn&#8217;t be spidered unless they asked for it.</p>
<p>Did it ever occur to them that if they want to keep something private, maybe they shouldn&#8217;t publish it on a world-wide, public computer network?</p>
<p>It&#8217;s the internet. You have no privacy. Get over it.</p>

		</li>

	
	
		<li class="alt" id="comment-10241">
			<cite>James</cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10241" title="">September 22nd, 2007 at 6:36 AM</a> </small>

			<p>You missed the blog in <a href="http://www.webmasterworld.com/robots.txt" rel="nofollow">http://www.webmasterworld.com/robots.txt</a></p>

		</li>

	
	
		<li class="" id="comment-10245">
			<cite><a href='http://simplyauser.wordpress.com/2007/09/23/links-for-2007-09-23/' rel='external nofollow' class='url'>links for 2007-09-23 &laquo; Simply&#8230; A User</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10245" title="">September 22nd, 2007 at 5:30 PM</a> </small>

			<p>[...] nextthing.org » robots.txt Adventure (tags: web robots.txt http search spider robots standards internet google genius analysis **) [...]</p>

		</li>

	
	
		<li class="alt" id="comment-10249">
			<cite><a href='http://www.egorych.com' rel='external nofollow' class='url'>egorych</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10249" title="">September 23rd, 2007 at 3:45 AM</a> </small>

			<p>Hey, I&#8217;ve translated this article into Russian (of course you&#8217;ve got some more links <img src='http://www.nextthing.org/wordpress/wp-includes/images/smilies/icon_smile.gif' alt=':)' class='wp-smiley' /> ).<br />
This is great. I&#8217;m surprised how many sites from Dmoz have such stupid errors. They are likely to be good sites, aren&#8217;t they? It&#8217;s so hard to get into dmoz now&#8230;</p>
<p>Good job.</p>

		</li>

	
	
		<li class="" id="comment-10254">
			<cite><a href='http://www.gully.org' rel='external nofollow' class='url'>Nick Gully</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10254" title="">September 23rd, 2007 at 11:26 AM</a> </small>

			<p>I think you&#8217;re missing some important rules for robots to follow:<br />
# A robot may not injure a human being or through inaction allow a human being to come to harm.<br />
# A robot must obey the orders given it by human beings, except where such orders would conflict with the First Law<br />
# A robot must protect its own existence, as long as such protection does not conflict with the First or Second Laws.</p>

		</li>

	
	
		<li class="alt" id="comment-10258">
			<cite><a href='http://boston.conman.org/' rel='external nofollow' class='url'>Sean Conner</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10258" title="">September 24th, 2007 at 1:54 AM</a> </small>

			<p>Just one small quibble:  it&#8217;s &#8220;c-o-n-n-E-r&#8221;.</p>

		</li>

	
	
		<li class="" id="comment-10260">
			<cite><a href='http://www.cs.hmc.edu/~awooster/' rel='external nofollow' class='url'>Andrew</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10260" title="">September 24th, 2007 at 9:51 AM</a> </small>

			<p>Sorry about that Sean. I&#8217;ve fixed it in the article.</p>

		</li>

	
	
		<li class="alt" id="comment-10262">
			<cite><a href='http://blog.elisehuard.be/?p=156' rel='external nofollow' class='url'>Links on a fickle monday</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10262" title="">September 24th, 2007 at 11:56 AM</a> </small>

			<p>[...] Interesting web surveys: robots.txt and http headers (via Simon Willison). [...]</p>

		</li>

	
	
		<li class="" id="comment-10268">
			<cite><a href='http://www.maxdesign.com.au/2007/09/25/some-links-143/' rel='external nofollow' class='url'>Max Design - standards based web design, development and training &raquo; Some links for light reading (25/9/07)</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10268" title="">September 24th, 2007 at 2:51 PM</a> </small>

			<p>[...] robots.txt Adventure [...]</p>

		</li>

	
	
		<li class="alt" id="comment-10269">
			<cite><a href='http://www.kingjason.co.uk/blog' rel='external nofollow' class='url'>Jason King</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10269" title="">September 24th, 2007 at 6:37 PM</a> </small>

			<p>Oh that&#8217;s amusing, interesting and useful too.</p>
<p>I frequently do health checks on other people&#8217;s websites but it hadn&#8217;t occurred to me to check they&#8217;ve written their robots file correctly. I&#8217;ll add that to my list of checks.</p>

		</li>

	
	
		<li class="" id="comment-10274">
			<cite><a href='http://www.drweb.de/weblog/weblog/?p=894' rel='external nofollow' class='url'>robots.txt analysiert | Webmaster</a></cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-10274" title="">September 26th, 2007 at 1:08 AM</a> </small>

<p>[...] An interesting study by Andrew Wooster. He had a home-built spider visit 4.6 million domains, collecting and analyzing each one&#8217;s robots.txt file. This brought to light not only statistics on status codes and MIME types, but also all sorts of oddities that point to some strange understandings of the file. Texts of every kind, keywords, logs, lists, and even ASCII art turn up in a file aimed exclusively at bots and spiders. [...]</p>

		</li>

	
	
		<li class="alt" id="comment-26151">
			<cite>Mike Scirocco</cite> Says:
						<br />

			<small class="commentmetadata"><a href="#comment-26151" title="">February 24th, 2009 at 5:39 PM</a> </small>

			<p>Can I use a wildcard in a filename? e.g.</p>
<p><a href="http://www.domain.com/admin*.php" rel="nofollow">http://www.domain.com/admin*.php</a><br />
<a href="http://www.domain.com/*.txt" rel="nofollow">http://www.domain.com/*.txt</a></p>

		</li>

	
	
	</ol>

 


<h3 id="respond">Leave a Reply</h3>


<form action="http://www.nextthing.org/wordpress/wp-comments-post.php" method="post" id="commentform">


<p><input type="text" name="author" id="author" value="" size="22" tabindex="1" />
<label for="author"><small>Name </small></label></p>

<p><input type="text" name="email" id="email" value="" size="22" tabindex="2" />
<label for="email"><small>Mail (will not be published) </small></label></p>

<p><input type="text" name="url" id="url" value="" size="22" tabindex="3" />
<label for="url"><small>Website</small></label></p>

<p>
  Please spell "response" backwards:
  <input type="text" name="comment_turing" id="comment_turing" tabindex="4" />
  <small>(required)</small>
</p>
<!--<p><small><strong>XHTML:</strong> You can use these tags: &lt;a href=&quot;&quot; title=&quot;&quot;&gt; &lt;abbr title=&quot;&quot;&gt; &lt;acronym title=&quot;&quot;&gt; &lt;b&gt; &lt;blockquote cite=&quot;&quot;&gt; &lt;cite&gt; &lt;code&gt; &lt;del datetime=&quot;&quot;&gt; &lt;em&gt; &lt;i&gt; &lt;q cite=&quot;&quot;&gt; &lt;strike&gt; &lt;strong&gt; </small></p>-->

<p><textarea name="comment" id="comment" cols="100%" rows="10" tabindex="4"></textarea></p>

<p><input name="submit" type="submit" id="submit" tabindex="5" value="Submit Comment" />
<input type="hidden" name="comment_post_ID" value="30" />
</p>
<p style="display: none;"><input type="hidden" id="akismet_comment_nonce" name="akismet_comment_nonce" value="498e7f6e32" /></p>
</form>

<p>
<br />
</p>
</div>





	
	</div>

<hr />
<div id="footer">
	<p>
		nextthing.org is proudly powered by 
		<a href="http://wordpress.org/" rel="nofollow">WordPress</a>
		<br /><a href="feed:http://www.nextthing.org/feed">Entries (RSS)</a>
		and <a href="feed:http://www.nextthing.org/comments/feed">Comments (RSS)</a>.
		<br />&#169;2006 Andrew Wooster
	</p>
</div>
</div>


<script src="http://www.google-analytics.com/urchin.js" type="text/javascript">
</script>
<script type="text/javascript">
_uacct = "UA-523190-1";
urchinTracker();
</script>

</body>
</html>