/**************************************************************
Purpose: print every node reachable from NodeHeader.

The whole sibling chain (linked through ->brother) is printed first,
one line per node in the form "host:port/dir/page => file IsHandled";
afterwards the function recurses into the ->child list of each
sibling, so output is grouped level by level.

A page value of "@" is printed as an empty string, and a dir value of
"/" is printed without the extra surrounding slashes.
***************************************************************/
void DisplayNode(WEBNODE * NodeHeader)
{
    WEBNODE * cur;
    const char * shownPage;

    fprintf(stdout, "\n");

    /* Pass 1: print all nodes of this sibling chain. */
    for(cur = NodeHeader; cur; cur = cur->brother) {
        shownPage = strcmp(cur->page, "@") ? cur->page : "";
        if(strcmp(cur->dir, "/") == 0) {
            fprintf(stdout, "\t%s:%d%s%s => %s %d\n", cur->host, cur->port, cur->dir, shownPage, cur->file, cur->IsHandled);
        }
        else {
            fprintf(stdout, "\t%s:%d/%s/%s => %s %d\n", cur->host, cur->port, cur->dir, shownPage, cur->file, cur->IsHandled);
        }
    }

    /* Pass 2: descend into the children of every sibling. */
    for(cur = NodeHeader; cur; cur = cur->brother) {
        if(cur->child) DisplayNode(cur->child);
    }
}
/**************************************************************
Purpose: process a single node.

Makes the node current (NodeCurr), resolves its host name into the
file-level `host` variable, builds an HTTP/1.0 GET request in the
file-level `request` buffer and then calls DoneWithList(1) followed by
AnalyzePage(NodeCurr).  The process exits with status 1 when the host
name cannot be resolved.

Only UserAgent, Accept and Connection are placed in the request; the
other header buffers are filled by GetLocalAgent but not used here.

NOTE(review): gethostbyname reports failures through h_errno, so the
strerror(errno) text in the debug message may be unrelated - confirm.
NOTE(review): sprintf writes into `request` without a bound; its size
is declared outside this block - confirm it is large enough.
***************************************************************/
void HandOneNode(WEBNODE * node)
{
    char UserAgent[1024] = "";
    char Accept[1024] = "";
    char AcceptLanguage[1024] = "";
    char AcceptEncoding[1024] = "";
    char AcceptCharset[1024] = "";
    char KeepAlive[1024] = "";
    char Connection[1024] = "";
    char ContentType[1024] = "";
    const char * pagePart;

    NodeCurr = node;

    /* get ip address by domain */
    host = gethostbyname(NodeCurr->host);
    if(host == NULL) {
        if(DEBUG) fprintf(stderr,"\tGethostname '%s' error, %s\n", NodeCurr->host, strerror(errno));
        exit(1);
    }

    /* Get client browser information */
    GetLocalAgent(UserAgent, Accept, AcceptLanguage, AcceptEncoding, AcceptCharset, KeepAlive, Connection, ContentType);

    /* A page of "@" stands for "no page" and is sent as an empty string. */
    pagePart = strcmp(NodeCurr->page, "@") ? NodeCurr->page : "";

    if(strcmp(NodeCurr->dir, "/") != 0) {
        sprintf(request, "GET /%s/%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\nAccept: %s\r\nConnection: %s\r\n\r\n", NodeCurr->dir, pagePart, NodeCurr->host, UserAgent, Accept, Connection);
    }
    else {
        sprintf(request, "GET %s%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\nAccept: %s\r\nConnection: %s\r\n\r\n", NodeCurr->dir, pagePart, NodeCurr->host, UserAgent, Accept, Connection);
    }

    DoneWithList(1);
    AnalyzePage(NodeCurr);
}
/* Return non-zero when c is a character accepted inside an e-mail
   address by this scanner: ASCII letters, digits, '.', '-' and '_'. */
static int IsMailAddrChar(char c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
           (c >= '0' && c <= '9') || c == '.' || c == '-' || c == '_';
}

/**************************************************************
Purpose: extract e-mail addresses from the string src and append them
to the file "email.txt" in the current directory.

Each output line is "<address>\t<URL of the current node>", where the
URL is built from NodeCurr (host, dir, page).  A candidate is the run
of address characters around an '@'; it is accepted when it is longer
than 3 characters in total.

Parameters:
  src - NUL-terminated text to scan; NULL is ignored.

The function returns silently when email.txt cannot be opened.
Candidates that would not fit into the local 1024-byte buffer are
skipped instead of overflowing it.
***************************************************************/
void GetEmail(char * src)
{
    char * pa, * pb, * pc, * pd;
    char myemail[1024] = "";
    size_t len;
    FILE * mailfp = NULL;

    if(src == NULL) return;
    if((mailfp = fopen("email.txt", "a+")) == NULL) return;

    pa = src;
    while((pb = strchr(pa, '@'))) {
        /* Walk back to the start of the local part.  The scan is bounded
           by src so that text beginning with an address never causes a
           read in front of the buffer. */
        pc = pb;
        while(pc > src && IsMailAddrChar(pc[-1])) pc--;

        /* pd is set to the first character after the domain part. */
        GetAfterPos(pb, &pd);

        if(pd && (pd - pc) > 3) {
            len = (size_t)(pd - pc);
            if(len < sizeof(myemail)) {
                memset(myemail, 0, sizeof(myemail));
                memcpy(myemail, pc, len);
                if(strcmp(NodeCurr->dir, "/")) fprintf(mailfp, "%s\thttp://%s/%s/%s\n", myemail, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "@")?NodeCurr->page:"");
                else fprintf(mailfp, "%s\thttp://%s%s%s\n", myemail, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "@")?NodeCurr->page:"");
            }
            /* The address ended the string: pd + 1 would lie past the
               terminator, so stop here. */
            if(*pd == '\0') break;
            pa = pd + 1;
        }
        else {
            /* pb points at '@', so pb + 1 is at worst the terminator and
               the next strchr simply ends the loop. */
            pa = pb + 1;
        }
    }
    fclose(mailfp);
}
/**************************************************************
Purpose: starting just before src, walk backwards over letters, digits,
'.', '-' and '_' - i.e. the part of an e-mail address in front of '@'.

Parameters:
  src - pointer to the '@' inside a larger string; NULL is allowed.
  d   - receives the address of the first character of the run, or
        src itself when the character before src is not part of an
        address, or NULL when src is NULL.

Precondition: the scan has no lower bound, so the caller must make sure
that a character outside the accepted set precedes the run inside the
same buffer (src must not be the first character of the buffer).
***************************************************************/
void GetBeforePos(char * src, char ** d)
{
    char * x;
    char c;

    /* The former test "src - 1" was true for NULL as well, so a NULL
       argument was dereferenced. */
    if(src == NULL) {
        *d = NULL;
        return;
    }

    x = src;
    for(;;) {
        c = x[-1];
        if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
           (c >= '0' && c <= '9') || c == '.' || c == '-' || c == '_') {
            x--;
            continue;
        }
        break;
    }
    *d = x;
}
/**************************************************************
Purpose: starting just after src, walk forward over letters, digits,
'.', '-' and '_' - i.e. the part of an e-mail address behind '@'.

Parameters:
  src - pointer to the '@' inside a NUL-terminated string; NULL is
        allowed.  src must not point at the terminator itself, because
        scanning starts at src + 1.
  d   - receives the address of the first character that does not
        belong to the run (possibly the terminating NUL), or NULL when
        src is NULL.
***************************************************************/
void GetAfterPos(char * src, char ** d)
{
    char * x;
    char c;

    /* The former test "src + 1" was true for NULL as well, so a NULL
       argument was dereferenced. */
    if(src == NULL) {
        *d = NULL;
        return;
    }

    /* The terminating NUL is not in the accepted set, so the scan
       always stops inside the string. */
    for(x = src + 1; ; x++) {
        c = *x;
        if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
           (c >= '0' && c <= '9') || c == '.' || c == '-' || c == '_') {
            continue;
        }
        break;
    }
    *d = x;
}
/* zhoulifa 回复于:2006-09-01 09:54:37 */
/**************************************************************
Purpose: starting at src, walk forward over the characters accepted in
a link target - letters, digits and the punctuation . - _ = : / ? & -
i.e. the part of a web address that follows the host name.

Parameters:
  src - start of the text to scan (NUL-terminated); may be 0.
  d   - receives the address of the first character that is not
        accepted (possibly the terminating NUL), or 0 when src is 0.
***************************************************************/
void GetAfterPosWithSlash(char * src, char ** d)
{
    char * scan;
    char c;

    if(!src) {
        *d = 0;
        return;
    }

    for(scan = src; ; scan++) {
        c = *scan;
        if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) continue;
        if(c == '.' || c == '-' || c == '_' || c == '=') continue;
        if(c == ':' || c == '/' || c == '?' || c == '&') continue;
        break;
    }
    *d = scan;
}
/**************************************************************
Purpose: make *myanchor point to a zero-filled buffer of len + 1 bytes.

Parameters:
  myanchor - address of the buffer pointer.  When *myanchor is NULL a
             new buffer is allocated, otherwise the existing one is
             resized.  Previous contents are always cleared.
  len      - number of usable bytes (one more is reserved for the
             terminating NUL); negative values are treated as 0.

The caller owns the buffer and releases it with free().
On allocation failure a message is written to stderr and the process
exits with status 1, so callers never see a NULL buffer.
***************************************************************/
void GetMemory(char ** myanchor, int len)
{
    char * buf;
    size_t size;

    if(len < 0) len = 0;
    size = (size_t)len + 1;

    /* realloc(NULL, n) behaves like malloc(n), so one call covers both
       the first allocation and a resize.  The result goes through a
       temporary so the old pointer is not lost on failure. */
    buf = realloc(*myanchor, size);
    if(buf == NULL) {
        fprintf(stderr, "GetMemory: cannot allocate %lu bytes\n", (unsigned long)size);
        exit(1);
    }
    memset(buf, 0, size);
    *myanchor = buf;
}
/**************************************************************
Purpose: find the link targets in src and add each one as a child of
the current node (NodeCurr) through AddChildNode.

Every occurrence of "href=" is handled in document order.  The value
may be enclosed in single quotes, in double quotes, or be unquoted; an
unquoted value ends at the first character rejected by
GetAfterPosWithSlash.  Empty values are ignored, and a quoted value
without a closing quote is skipped.

Parameters:
  src - NUL-terminated text to scan; NULL is ignored.

The scratch buffer used for the extracted value is released before
returning.
NOTE(review): this assumes AddChildNode does not keep the pointer it
is given; the visible implementation copies the text - confirm.
***************************************************************/
void GetLink(char * src)
{
    char * cursor, * href, * value, * end;
    char * myanchor = 0;
    char quote;
    int len = 0;

    if(!src) return;

    cursor = src;
    while((href = strstr(cursor, "href="))) {
        value = href + 5;
        if(*value == '\'' || *value == '"') {
            quote = *value;
            value++;
            end = strchr(value, quote);
            if(!end) {
                /* No closing quote: skip this attribute and keep
                   looking for later links. */
                cursor = value;
                continue;
            }
        }
        else {
            GetAfterPosWithSlash(value, &end);
            if(!end) break;
        }

        len = (int)(end - value);
        GetMemory(&myanchor, len);
        memcpy(myanchor, value, len);
        if(len > 0) AddChildNode(NodeCurr, myanchor);

        /* An unquoted value may run up to the terminator; stepping over
           it would leave the string. */
        if(*end == '\0') break;
        cursor = end + 1;
    }
    free(myanchor);
}
/**************************************************************
Purpose: add the link text src as a child node of node, unless it is
filtered out or an equal node already exists in the tree.

Parameters:
  node - node whose page contained the link; supplies host, port and
         dir when the link does not carry its own host.
  src  - link text as found in the page; NULL is ignored.

Uses names declared outside this function: ret, e, NodeHeader,
NodeTail, FileNumber, MAXFILENAME, GetHost, Rstrchr.
***************************************************************/
void AddChildNode(WEBNODE * node, char * src)
{
int WebPort, len;
char * WebHost = 0, * PageAddress = 0, * WebDir = 0, * pC = 0;
WEBNODE * NewNode;
char filename[MAXFILENAME + 1] = "";
char IsFromRoot = 0;
/* Filter: ignore NULL, mailto: links, links containing one of the
   listed file extensions, javascript: links and links containing '+'. */
if(!src) return;
if(!strncasecmp(src, "mailto:", strlen("mailto:"))) return ;
if(strstr(src, ".css")) return;
if(strstr(src, ".xml")) return;
if(strstr(src, ".ico")) return;
if(strstr(src, ".jpg")) return;
if(strstr(src, ".gif")) return;
if(strstr(src, "javascript:")) return;
if(strstr(src, "+")) return;
/* NOTE(review): presumably GetHost splits an absolute URL into host,
   page, port and dir and returns non-zero when src has no host part -
   its definition is not visible here, confirm. */
ret = GetHost(src, &WebHost, &PageAddress, &WebPort, &WebDir);
if(ret) {
/* Non-zero result: take host and port from the parent node and derive
   dir and page from src itself. */
len = strlen(node->host);
GetMemory(&WebHost, len);
strcpy(WebHost, node->host);
WebPort = node->port;
/* IsFromRoot is set when src starts with '/'. */
IsFromRoot = !strncmp(src, "/", 1);
/* NOTE(review): presumably Rstrchr stores the position of the last '/'
   in pC (0 when there is none) - confirm against its definition.
   "src + 1" is a pointer test and is true for any non-NULL src. */
if(IsFromRoot && (src + 1)) Rstrchr(src + 1, '/', &pC);
else if(!IsFromRoot) Rstrchr(src, '/', &pC);
else pC = 0;
if(pC) {
/* A '/' was found: the text in front of pC becomes the directory
   (prefixed with node->dir and "/" when src does not start with '/'),
   the text behind it becomes the page. */
if(IsFromRoot) len = strlen(src + 1) - strlen(pC);
else len = strlen(src) - strlen(pC) + strlen(node->dir) + 1;
GetMemory(&WebDir, len);
if(IsFromRoot) memcpy(WebDir, src + 1, len);
else {memcpy(WebDir, node->dir, strlen(node->dir)); strcat(WebDir, "/"); memcpy(WebDir + strlen(node->dir) + 1, src, strlen(src) - strlen(pC));}
/* NOTE(review): "pC + 1" tests the pointer, not the character, so it
   is always true here and the else branch looks unreachable; a link
   ending in '/' then gets an empty page.  Possibly *(pC + 1) was
   intended - confirm. */
if(pC + 1) {
len = strlen(pC + 1);
GetMemory(&PageAddress, len);
strcpy(PageAddress, pC + 1);
}
else {
/* Copies the first byte of e as the page.
   NOTE(review): e is defined elsewhere; presumably it starts with the
   "@" placeholder that other functions compare page against - confirm. */
len = 1;
GetMemory(&PageAddress, len);
memcpy(PageAddress, e, len);
}
}
else {
/* No '/' separator: the whole link is the page name. */
if(IsFromRoot) {
/* Directory is the single byte at e + 1; page is src without its
   leading '/'.
   NOTE(review): presumably e[1] is '/', the root directory - confirm. */
len = 1;
GetMemory(&WebDir, len);
memcpy(WebDir, e + 1, len);
len = strlen(src + 1);
GetMemory(&PageAddress, len);
memcpy(PageAddress, src + 1, len);
}
else {
/* Relative link: same directory as the parent node. */
len = strlen(node->dir);
GetMemory(&WebDir, len);
memcpy(WebDir, node->dir, len);
len = strlen(src);
GetMemory(&PageAddress, len);
memcpy(PageAddress, src, len);
}
}
}
/* Skip the link when an equal node is already in the tree that starts
   at NodeHeader. */
ret = IsExistWeb(NodeHeader, WebHost, PageAddress, WebPort, WebDir);
if(ret) goto __ReturnAdd;
/* Attach the new node: as first child when node has none, otherwise
   behind NodeTail.
   NOTE(review): NodeTail is shared state set at the end of this
   function to the most recently added node; it is not necessarily the
   last child of this particular node - confirm the intended list shape.
   NOTE(review): none of the malloc results below are checked. */
if(node->child == NULL) NewNode = node->child = (WEBNODE *)malloc(sizeof(WEBNODE));
else NodeTail->brother = NewNode = (WEBNODE *)malloc(sizeof(WEBNODE));
memset(NewNode, 0, sizeof(WEBNODE));
NewNode->host = (char *)malloc(strlen(WebHost) + 1);
memset(NewNode->host, 0, strlen(WebHost) + 1);
NewNode->page = (char *)malloc(strlen(PageAddress) + 1);
memset(NewNode->page, 0, strlen(PageAddress) + 1);
NewNode->dir = (char *)malloc(strlen(WebDir) + 1);
memset(NewNode->dir, 0, strlen(WebDir) + 1);
NewNode->file = (char *)malloc(MAXFILENAME + 1);
memset(NewNode->file, 0, MAXFILENAME + 1);
strcpy(NewNode->host, WebHost);
strcpy(NewNode->page, PageAddress);
strcpy(NewNode->dir, WebDir);
/* Each new node gets the next sequential file name, file00000.html,
   file00001.html, ... taken from the FileNumber counter. */
sprintf(filename, "file%05d.html", FileNumber++);
strcpy(NewNode->file, filename);
NewNode->port = WebPort;
NewNode->IsHandled = 0;
NewNode->brother = 0;
NewNode->child = 0;
NodeTail = NewNode;
__ReturnAdd:
/* The node holds its own copies, so the temporary strings are released
   on both the "added" and the "already exists" path. */
free(WebHost); free(PageAddress); free(WebDir);
}
/**************************************************************
Purpose: check whether a page is already present in the node tree.

Parameters:
  node - first node of a sibling chain (linked through ->brother).
  host, page, port, dir - identity of the page being looked up.

Returns 1 when a node of this sibling chain matches all four values,
2 when the match is found somewhere below one of the siblings, and
0 when there is no match.  The whole chain is compared before any
child list is visited.  The result of each recursive call is stored in
the file-level variable ret.
***************************************************************/
int IsExistWeb(WEBNODE * node, char * host, char * page, int port, char * dir)
{
    WEBNODE * cur;

    /* Pass 1: direct comparison against every sibling. */
    for(cur = node; cur; cur = cur->brother) {
        if(strcmp(cur->host, host) != 0) continue;
        if(strcmp(cur->page, page) != 0) continue;
        if(cur->port != port) continue;
        if(strcmp(cur->dir, dir) != 0) continue;
        return 1;
    }

    /* Pass 2: search the subtree of every sibling that has children. */
    for(cur = node; cur; cur = cur->brother) {
        if(!cur->child) continue;
        ret = IsExistWeb(cur->child, host, page, port, dir);
        if(ret != 0) return 2;
    }

    return 0;
}
编译这个程序:
引用:
gcc mailaddrsearch.c -o mailsearcher
输入一个网址作为参数运行一下试试吧:
引用:
./mailsearcher http://zhoulifa.bokee.com/5531748.html
程序首先找出 http://zhoulifa.bokee.com/5531748.html 页面上的邮件地址保存到当前目录下 email.txt 文件里,每行一条记录,格式为邮件地址和出现该邮件地址的网页。然后分析这个页面上出现的网页链接,把各链接作为子节点加入链表, 再去处理子节点,重复上述操作。
这只是一个示例程序,并不完善,如果要使其达到实用的目的,还需要让这个程序效率更高点,比如加入 epoll ( 在 2.4 内核中只有 select 了 ) 实现 I/O 多路复用。又比如对每个子节点实现多线程,每个线程处理一个节点。
如果对 I/O 多路复用不熟悉,您可以看一下我这篇文章“Linux 下各类TCP网络服务器的实现源代码”: http://zhoulifa.bokee.com/5345930.html
上一页 [1] [2] [3]