<?xml version="1.0" encoding="UTF-8"?>
<!-- Initial basic version doing Studio and Thumb believed to have been written by C-Quel -->
<!-- Then updated by John Lockwood to scrape Title, Year, MPAA, Runtime, Rating, Votes, Plot, Actors, Directors -->
<!-- This version 1.1 dated 12/01/09 includes fix by C-Quel for processing results from Amazon to match recent change -->
<!-- Version 1.1 also now supports the Writers field -->
<scraper framework="1.0" date="2009-05-22" content="movies" name="Amazon US" thumb="amazonus.png" language="en">
<!-- CreateSearchUrl: builds the Amazon DVD search URL from buffer 1 and stores
     the resulting <url> fragment in buffer 3.
     Markup inside the output attribute is entity-escaped because a literal
     "<" is not allowed in an XML attribute value. The ampersand in the query
     string is escaped twice so that the generated <url> fragment still
     carries a valid "&amp;" (assumes the engine parses that fragment as XML
     again; TODO confirm against the scraper engine). -->
<CreateSearchUrl dest="3">
  <RegExp input="$$1" output="&lt;url&gt;http://www.amazon.com/s/ref=nb_ss_d_h_?url=search-alias%3Ddvd&amp;amp;field-keywords=\1&lt;/url&gt;" dest="3">
    <!-- Empty pattern: the whole of buffer 1 is substituted for \1
         (presumably the search terms; verify against the engine). -->
    <expression noclean="1"></expression>
  </RegExp>
</CreateSearchUrl>
<!-- GetSearchResults: turns the search result page in buffer 1 into a
     <results> document in buffer 8.
     The inner RegExp collects one <entity> (title + url) per product link
     into buffer 5; the outer RegExp wraps buffer 5 in the XML declaration and
     the <results> element.
     All "<", ">" and double quotes that belong to the generated markup or to
     the regular expressions are entity-escaped so this file itself stays
     well-formed. -->
<GetSearchResults dest="8">
  <RegExp input="$$5" output="&lt;?xml version=&quot;1.0&quot; encoding=&quot;iso-8859-1&quot; standalone=&quot;yes&quot;?&gt;&lt;results&gt;\1&lt;/results&gt;" dest="8">
    <RegExp input="$$1" output="&lt;entity&gt;&lt;title&gt;\2&lt;/title&gt;&lt;url&gt;\1&lt;/url&gt;&lt;/entity&gt;" dest="5">
      <!-- \1 = href of the product link, \2 = link text. Repeats for every
           productTitle anchor found on the page. -->
      <expression repeat="yes" clear="yes" noclean="1">productTitle"&gt;&lt;a href="([^"]*)"&gt;([^&lt;]*)&lt;/a&gt;</expression>
    </RegExp>
    <expression clear="yes" noclean="1"></expression>
  </RegExp>
</GetSearchResults>
<!-- GetDetails: extracts movie metadata from the product page in buffer 1.
     Each nested RegExp writes one field into buffer 5 (the first one with
     dest="5", all later ones with dest="5+"); the outer RegExp finally wraps
     buffer 5 in <details> and stores it in buffer 3.
     Buffer 9 is used as scratch space by the two-stage extractions (MPAA,
     writers, directors, actors): the inner RegExp isolates a snippet of the
     page, the outer one matches against that snippet.
     Changes in this revision:
       * generated markup and regex text are entity-escaped so the file is
         well-formed XML;
       * the title pattern strips the literal "Amazon.com: " prefix instead of
         any run of the characters A, m, a, z, o, n, ., c, : and space;
       * every expression that fills scratch buffer 9 carries clear="yes" so
         a section missing from the page cannot reuse the previous section's
         text (for example writers being reported as directors). -->
<GetDetails clearbuffers="no" dest="3">
  <RegExp input="$$5" output="&lt;details&gt;\1&lt;/details&gt;" dest="3">

    <!-- Title: text of the page <title> up to the first colon, after an
         optional "Amazon.com: " prefix (\1 = prefix, \2 = title).
         clear="yes" empties buffer 5 first, since clearbuffers="no" may leave
         content from an earlier call in it (NOTE(review): confirm buffer
         lifetime with the engine). -->
    <RegExp input="$$1" output="&lt;title&gt;\2&lt;/title&gt;" dest="5">
      <expression clear="yes" noclean="1,2">&lt;title&gt;(Amazon\.com: )?([^:]*)</expression>
    </RegExp>

    <!-- Year: four digits enclosed by space, bracket or parenthesis, followed
         by text that runs up to a closing span tag. -->
    <RegExp input="$$1" output="&lt;year&gt;\1&lt;/year&gt;" dest="5+">
      <expression trim="1">[ \[\(]([0-9]{4})[ \]\)][^&lt;]*&lt;/span&gt;</expression>
    </RegExp>

    <RegExp input="$$1" output="&lt;top250&gt;\1&lt;/top250&gt;" dest="5+">
      <expression>Top 250: #([0-9]*)&lt;/a&gt;</expression>
    </RegExp>

    <!-- MPAA rating: the inner RegExp looks after "Rating:" for a path
         segment such as "/pg-13" followed by one character and an underscore
         (presumably the rating icon file name; verify against a live page)
         and copies the rating code into buffer 9. The outer RegExp emits the
         canonical label when buffer 9 contains that code. -->
    <RegExp input="$$9" output="&lt;mpaa&gt;G&lt;/mpaa&gt;" dest="5+">
      <RegExp input="$$1" output="\1" dest="9">
        <expression clear="yes">&lt;b&gt;Rating: &lt;/b&gt;[^_]*/(g)._</expression>
      </RegExp>
      <expression>(g)</expression>
    </RegExp>
    <RegExp input="$$9" output="&lt;mpaa&gt;PG&lt;/mpaa&gt;" dest="5+">
      <RegExp input="$$1" output="\1" dest="9">
        <expression clear="yes">&lt;b&gt;Rating: &lt;/b&gt;[^_]*/(pg)._</expression>
      </RegExp>
      <expression>(pg)</expression>
    </RegExp>
    <RegExp input="$$9" output="&lt;mpaa&gt;PG-13&lt;/mpaa&gt;" dest="5+">
      <RegExp input="$$1" output="\1" dest="9">
        <expression clear="yes">&lt;b&gt;Rating: &lt;/b&gt;[^_]*/(pg-13)._</expression>
      </RegExp>
      <expression>(pg-13)</expression>
    </RegExp>
    <RegExp input="$$9" output="&lt;mpaa&gt;R&lt;/mpaa&gt;" dest="5+">
      <RegExp input="$$1" output="\1" dest="9">
        <expression clear="yes">&lt;b&gt;Rating: &lt;/b&gt;[^_]*/(r)._</expression>
      </RegExp>
      <expression>(r)</expression>
    </RegExp>
    <RegExp input="$$9" output="&lt;mpaa&gt;NC-17&lt;/mpaa&gt;" dest="5+">
      <RegExp input="$$1" output="\1" dest="9">
        <expression clear="yes">&lt;b&gt;Rating: &lt;/b&gt;[^_]*/(nc-17)._</expression>
      </RegExp>
      <expression>(nc-17)</expression>
    </RegExp>
    <RegExp input="$$9" output="&lt;mpaa&gt;UNRATED&lt;/mpaa&gt;" dest="5+">
      <RegExp input="$$1" output="\1" dest="9">
        <expression clear="yes">&lt;b&gt;Rating: &lt;/b&gt;[^_]*/(unrated)._</expression>
      </RegExp>
      <expression>(unrated)</expression>
    </RegExp>

    <RegExp input="$$1" output="&lt;certification&gt;\1&lt;/certification&gt;" dest="5+">
      <expression repeat="yes">Classification:&lt;/b&gt;[^&gt;]*alt="([0-9]*)"</expression>
    </RegExp>

    <RegExp input="$$1" output="&lt;tagline&gt;\1&lt;/tagline&gt;" dest="5+">
      <expression>&lt;h5&gt;Tagline:&lt;/h5&gt;([^&lt;]*)</expression>
    </RegExp>

    <!-- Runtime: everything from the first digit after "Run Time:" up to the
         end of the list item. -->
    <RegExp input="$$1" output="&lt;runtime&gt;\1&lt;/runtime&gt;" dest="5+">
      <expression trim="1">Run Time:&lt;/b&gt;[^0-9]*([^&lt;]*)&lt;/li&gt;</expression>
    </RegExp>

    <!-- Rating and votes: \1 and \2 are the two digits of the "stars-N-N"
         token (emitted as N.N), \3 is the customer review count. -->
    <RegExp input="$$1" output="&lt;rating&gt;\1.\2&lt;/rating&gt;&lt;votes&gt;\3&lt;/votes&gt;" dest="5+">
      <expression noclean="1">Average Customer Review&lt;/b&gt;[^_]*stars-([0-9])-([0-9])[^)]*&gt;([0-9]*) customer reviews&lt;/a&gt;\)</expression>
    </RegExp>

    <RegExp input="$$1" output="&lt;genre&gt;\1&lt;/genre&gt;" dest="5+">
      <expression repeat="yes">"/Sections/Genres/[^/]*/"&gt;([^&lt;]*)&lt;/a&gt;</expression>
    </RegExp>

    <RegExp input="$$1" output="&lt;studio&gt;\1&lt;/studio&gt;" dest="5+">
      <expression>Studio:&lt;/b&gt; ([^&lt;]*)&lt;/li&gt;</expression>
    </RegExp>

    <!-- Plot: the "Plot Outline/Summary" text fills both outline and plot;
         the "Product Description" text below adds a further plot element. -->
    <RegExp input="$$1" output="&lt;outline&gt;\2&lt;/outline&gt;&lt;plot&gt;\2&lt;/plot&gt;" dest="5+">
      <expression trim="1">Plot (Outline|Summary):&lt;/h5&gt;([^&lt;]*)</expression>
    </RegExp>
    <RegExp input="$$1" output="&lt;plot&gt;\1&lt;/plot&gt;" dest="5+">
      <expression trim="1">&lt;b&gt;Product Description&lt;/b&gt;&lt;br /[^&gt;]*&gt;([^&lt;]+)</expression>
    </RegExp>

    <!-- Thumb: takes the "original_image" URL up to its AA2n0_.jpg size
         suffix and appends "01.L.jpg" (the output is \1 followed by the
         literal text 01.L.jpg; presumably the large image variant). -->
    <RegExp input="$$1" output="&lt;thumb&gt;\101.L.jpg&lt;/thumb&gt;" dest="5+">
      <expression noclean="1">"original_image", "([^"]*)AA2[0-9]0_\.jpg"</expression>
    </RegExp>

    <!-- Writers, directors, actors: the inner RegExp copies the rest of the
         labelled line (through its last closing anchor tag) into buffer 9;
         the outer RegExp then emits one element per anchor text.
         clear="yes" on the inner expression resets buffer 9 when the label
         is absent, so names from the preceding section are not repeated. -->
    <RegExp input="$$9" output="&lt;credits&gt;\1&lt;/credits&gt;" dest="5+">
      <RegExp input="$$1" output="\1" dest="9">
        <expression clear="yes" noclean="1">&lt;b&gt;Writers:&lt;/b&gt; ([^\n]*&lt;/a&gt;)</expression>
      </RegExp>
      <expression noclean="1" repeat="yes">[^&gt;]*&gt;([^&lt;]+)&lt;/a&gt;</expression>
    </RegExp>
    <RegExp input="$$9" output="&lt;director&gt;\1&lt;/director&gt;" dest="5+">
      <RegExp input="$$1" output="\1" dest="9">
        <expression clear="yes" noclean="1">&lt;b&gt;Directors:&lt;/b&gt; ([^\n]*&lt;/a&gt;)</expression>
      </RegExp>
      <expression noclean="1" repeat="yes">[^&gt;]*&gt;([^&lt;]+)&lt;/a&gt;</expression>
    </RegExp>
    <RegExp input="$$9" output="&lt;actor&gt;&lt;name&gt;\1&lt;/name&gt;&lt;/actor&gt;" dest="5+">
      <RegExp input="$$1" output="\1" dest="9">
        <expression clear="yes" noclean="1">&lt;b&gt;Actors:&lt;/b&gt; ([^\n]*&lt;/a&gt;)</expression>
      </RegExp>
      <expression noclean="1" repeat="yes">[^&gt;]*&gt;([^&lt;]+)&lt;/a&gt;</expression>
    </RegExp>

    <expression noclean="1"></expression>
  </RegExp>
</GetDetails>
</scraper>