/*
 *
 * bashloader
 *
 * for use with following table
 *
 * CREATE TABLE bash (
 *   id int auto_increment primary key,
 *   bashid int,
 *   rating int,
 *   quote text,
 *   KEY bash_rating(rating)
 * );
 */
15 |
|
16 |
#include <iostream>
#include <sstream>
#include <string>

#include <stdlib.h>
#include <unistd.h>

#include "readUrl.h"
23 |
|
24 |
using namespace std; |
25 |
|
26 |
/**
 * Replaces every occurrence of `search` inside `*str` with `replace`, in place.
 *
 * After each substitution the scan resumes just past the inserted text, so
 * the replacement itself is never re-scanned (a `replace` that contains
 * `search` cannot cause an endless loop).
 *
 * @param str     String to modify; a null pointer is ignored.
 * @param search  Substring to look for; an empty string makes the call a no-op.
 * @param replace Text substituted for each match. Defaults to "", which
 *                deletes every match.
 */
void cleanString(std::string* str, std::string search, std::string replace = "")
{
    // An empty needle matches at every position and would never terminate.
    if (!str || search.empty())
        return;

    // Must be size_type: a narrower unsigned type truncates npos on 64-bit
    // targets, so the "not found" comparison below would never succeed.
    std::string::size_type pos = 0;
    while ((pos = str->find(search, pos)) != std::string::npos)
    {
        str->replace(pos, search.size(), replace);
        pos += replace.size();
    }
}
35 |
|
36 |
void parseQuote(std::string str) |
37 |
{ |
38 |
int pos, end_pos; |
39 |
string quote, str_number,str_rating; |
40 |
// cout << "-----------------------------------------------"<< endl; |
41 |
pos = str.find("#"); |
42 |
pos++; |
43 |
end_pos = str.find("<", pos); |
44 |
str_number = str.substr(pos, end_pos-pos); |
45 |
|
46 |
pos = str.find("(", end_pos); |
47 |
pos++; |
48 |
end_pos = str.find(")", pos); |
49 |
str_rating = str.substr(pos, end_pos-pos); |
50 |
pos = str.find("<p"); |
51 |
|
52 |
pos += 14; |
53 |
quote = str.substr(pos); |
54 |
|
55 |
cleanString("e, "\r"); |
56 |
cleanString("e, "&", "&"); |
57 |
cleanString("e, "<br />"); |
58 |
cleanString("e, "<", "<"); |
59 |
cleanString("e, ">", ">"); |
60 |
cleanString("e, " ", " "); |
61 |
cleanString("e, "\\", "\\\\"); |
62 |
cleanString("e, """, "\""); |
63 |
cleanString("e, "'", "\\'"); |
64 |
|
65 |
cout << "INSERT INTO bash (bashid, rating, quote) values (" << str_number << "," << str_rating << ",'" << quote << "');" << endl; |
66 |
} |
67 |
|
68 |
void parseDocument(int i) |
69 |
{ |
70 |
ostringstream url; |
71 |
url << "http://bash.org/?browse&p=" << i; |
72 |
string document = readUrl( url.str() ); |
73 |
|
74 |
unsigned int pos=0, end_pos; |
75 |
while(1) |
76 |
{ |
77 |
pos = document.find("<p class=\"quote\">",pos); |
78 |
if (pos == string::npos) |
79 |
break; |
80 |
pos += 10; |
81 |
end_pos = document.find("</p>", pos); |
82 |
|
83 |
end_pos = document.find("</p>", end_pos+4); |
84 |
|
85 |
parseQuote(document.substr(pos, end_pos-pos) ); |
86 |
} |
87 |
|
88 |
} |
89 |
|
90 |
int main() |
91 |
{ |
92 |
unsigned pos, end_pos, max; |
93 |
string document = readUrl("http://bash.org/?browse"); |
94 |
|
95 |
if (document == "") |
96 |
exit(1); |
97 |
|
98 |
pos = document.rfind("<option value="); |
99 |
pos += 15; |
100 |
end_pos = document.find("\"", pos+1); |
101 |
max = atoi( document.substr(pos, end_pos-pos).c_str() ); |
102 |
|
103 |
cout << "CREATE TABLE IF NOT EXISTS bash (" << endl; |
104 |
cout << " id int auto_increment primary key," << endl; |
105 |
cout << " bashid int," << endl; |
106 |
cout << " rating int," << endl; |
107 |
cout << " quote text," << endl; |
108 |
cout << " KEY bash_rating(rating)" << endl; |
109 |
cout << " );" << endl << endl; |
110 |
|
111 |
cout << "TRUNCATE TABLE bash;" << endl; |
112 |
//parseDocument(2); |
113 |
for (unsigned int i=1; i<= max; i++) |
114 |
{ |
115 |
cerr << "Parsing #" << i << "/" << max << endl; |
116 |
parseDocument(i); |
117 |
usleep(50*1000); //be nice to the webserver |
118 |
|
119 |
} |
120 |
} |