1 |
torben |
547 |
/*
 * bashloader
 *
 * For use with the following table:
 *
 * CREATE TABLE bash (
 *     id int auto_increment primary key,
 *     bashid int,
 *     rating int,
 *     quote text,
 *     KEY bash_rating(rating)
 * );
 */
15 |
|
|
|
16 |
|
|
#include <iostream> |
17 |
|
|
#include <sstream> |
18 |
|
|
#include <string> |
19 |
|
|
|
20 |
|
|
#include <stdlib.h> |
21 |
|
|
|
22 |
|
|
#include "readUrl.h" |
23 |
|
|
|
24 |
|
|
using namespace std; |
25 |
|
|
|
26 |
|
|
/**
 * Replaces every occurrence of `search` in `*str` with `replace`, in place.
 *
 * Scanning resumes directly after each inserted replacement, so text that a
 * replacement introduces is never matched again (e.g. replacing "\\" with
 * "\\\\" terminates and doubles each backslash exactly once).
 *
 * @param str     string to modify; a null pointer is ignored
 * @param search  substring to look for; an empty needle leaves `*str` untouched
 * @param replace text substituted for each match (default "" deletes matches)
 */
void cleanString(std::string* str, std::string search, std::string replace = "")
{
    // An empty needle matches at every position and would never terminate.
    if (!str || search.empty())
        return;

    // Must be size_type: a narrower unsigned type truncates std::string::npos,
    // so the "not found" comparison never succeeds on platforms where size_t
    // is wider than unsigned int, and replace() is then called out of range.
    std::string::size_type pos = 0;
    while ((pos = str->find(search, pos)) != std::string::npos)
    {
        str->replace(pos, search.size(), replace);
        pos += replace.size();
    }
}
35 |
|
|
|
36 |
|
|
void parseQuote(std::string str) |
37 |
|
|
{ |
38 |
|
|
int pos, end_pos; |
39 |
|
|
string quote, str_number,str_rating; |
40 |
|
|
// cout << "-----------------------------------------------"<< endl; |
41 |
|
|
pos = str.find("#"); |
42 |
|
|
pos++; |
43 |
|
|
end_pos = str.find("<", pos); |
44 |
|
|
str_number = str.substr(pos, end_pos-pos); |
45 |
|
|
|
46 |
|
|
pos = str.find("(", end_pos); |
47 |
|
|
pos++; |
48 |
|
|
end_pos = str.find(")", pos); |
49 |
|
|
str_rating = str.substr(pos, end_pos-pos); |
50 |
|
|
pos = str.find("<p"); |
51 |
|
|
|
52 |
|
|
pos += 14; |
53 |
|
|
quote = str.substr(pos); |
54 |
|
|
|
55 |
|
|
cleanString("e, "\r"); |
56 |
|
|
cleanString("e, "&", "&"); |
57 |
|
|
cleanString("e, "<br />"); |
58 |
|
|
cleanString("e, "<", "<"); |
59 |
|
|
cleanString("e, ">", ">"); |
60 |
|
|
cleanString("e, " ", " "); |
61 |
|
|
cleanString("e, "\\", "\\\\"); |
62 |
|
|
cleanString("e, """, "\""); |
63 |
|
|
cleanString("e, "'", "\\'"); |
64 |
|
|
|
65 |
|
|
cout << "INSERT INTO bash (bashid, rating, quote) values (" << str_number << "," << str_rating << ",'" << quote << "');" << endl; |
66 |
|
|
} |
67 |
|
|
|
68 |
|
|
void parseDocument(int i) |
69 |
|
|
{ |
70 |
|
|
ostringstream url; |
71 |
|
|
url << "http://bash.org/?browse&p=" << i; |
72 |
|
|
string document = readUrl( url.str() ); |
73 |
|
|
|
74 |
|
|
unsigned int pos=0, end_pos; |
75 |
|
|
while(1) |
76 |
|
|
{ |
77 |
|
|
pos = document.find("<p class=\"quote\">",pos); |
78 |
|
|
if (pos == string::npos) |
79 |
|
|
break; |
80 |
|
|
pos += 10; |
81 |
|
|
end_pos = document.find("</p>", pos); |
82 |
|
|
|
83 |
|
|
end_pos = document.find("</p>", end_pos+4); |
84 |
|
|
|
85 |
|
|
parseQuote(document.substr(pos, end_pos-pos) ); |
86 |
|
|
} |
87 |
|
|
|
88 |
|
|
} |
89 |
|
|
|
90 |
|
|
int main() |
91 |
|
|
{ |
92 |
|
|
unsigned pos, end_pos, max; |
93 |
|
|
string document = readUrl("http://bash.org/?browse"); |
94 |
|
|
|
95 |
|
|
if (document == "") |
96 |
|
|
exit(1); |
97 |
|
|
|
98 |
|
|
pos = document.rfind("<option value="); |
99 |
|
|
pos += 15; |
100 |
|
|
end_pos = document.find("\"", pos+1); |
101 |
|
|
max = atoi( document.substr(pos, end_pos-pos).c_str() ); |
102 |
|
|
|
103 |
|
|
cout << "CREATE TABLE IF NOT EXISTS bash (" << endl; |
104 |
|
|
cout << " id int auto_increment primary key," << endl; |
105 |
|
|
cout << " bashid int," << endl; |
106 |
|
|
cout << " rating int," << endl; |
107 |
|
|
cout << " quote text," << endl; |
108 |
|
|
cout << " KEY bash_rating(rating)" << endl; |
109 |
|
|
cout << " );" << endl << endl; |
110 |
|
|
|
111 |
|
|
cout << "TRUNCATE TABLE bash;" << endl; |
112 |
|
|
//parseDocument(2); |
113 |
|
|
for (unsigned int i=1; i<= max; i++) |
114 |
|
|
{ |
115 |
|
|
cerr << "Parsing #" << i << "/" << max << endl; |
116 |
|
|
parseDocument(i); |
117 |
|
|
usleep(50*1000); //be nice to the webserver |
118 |
|
|
|
119 |
|
|
} |
120 |
|
|
} |