HTMLNode.m 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. //
  2. // HTMLNode.m
  3. // StackOverflow
  4. //
  5. // Created by Ben Reeves on 09/03/2010.
  6. // Copyright 2010 Ben Reeves. All rights reserved.
  7. //
  8. #import "HTMLNode.h"
  9. #import <libxml/HTMLtree.h>
  10. @implementation HTMLNode
  // Returns a new HTMLNode wrapping the libxml2 parent of this node.
  // A wrapper is returned even when there is no parent; in that case it wraps NULL.
  -(HTMLNode*)parent
  {
      // This class hands out wrappers around NULL (e.g. firstChild on a leaf), so
      // _node itself may be NULL and must not be dereferenced blindly.
      xmlNode * parentNode = (_node != NULL) ? _node->parent : NULL;
      return [[HTMLNode alloc] initWithXMLNode:parentNode];
  }
  // Returns a new HTMLNode wrapping the node that follows this one in its sibling list.
  // A wrapper is returned even when there is no following sibling; it then wraps NULL.
  -(HTMLNode*)nextSibling {
      // Wrappers around NULL are created by this class's own accessors, so guard
      // the dereference instead of crashing when navigation is chained.
      xmlNode * following = (_node != NULL) ? _node->next : NULL;
      return [[HTMLNode alloc] initWithXMLNode:following];
  }
  // Returns a new HTMLNode wrapping the node that precedes this one in its sibling list.
  // A wrapper is returned even when there is no preceding sibling; it then wraps NULL.
  -(HTMLNode*)previousSibling {
      // Wrappers around NULL are created by this class's own accessors, so guard
      // the dereference instead of crashing when navigation is chained.
      xmlNode * preceding = (_node != NULL) ? _node->prev : NULL;
      return [[HTMLNode alloc] initWithXMLNode:preceding];
  }
  // Replaces the value of the attribute called nameStr on node with a copy of value.
  //
  // Only the first attribute whose name equals nameStr is considered, and only the
  // first child node of that attribute is rewritten. The call is a no-op when any
  // argument is NULL, when node is not an element, when no such attribute exists,
  // or when the attribute has no child node to hold a value.
  void setAttributeNamed(xmlNode * node, const char * nameStr, const char * value) {
      if (node == NULL || nameStr == NULL || value == NULL)
          return;

      // libxml2 only keeps an attribute list in `properties` for element nodes.
      if (node->type != XML_ELEMENT_NODE)
          return;

      for (xmlAttrPtr attr = node->properties; attr != NULL; attr = attr->next)
      {
          if (strcmp((const char *)attr->name, nameStr) != 0)
              continue;

          // Let libxml2 release the old text and duplicate the new one with its own
          // allocator. Nothing is allocated up front, so nothing leaks when the
          // attribute has no child to update.
          if (attr->children != NULL)
              xmlNodeSetContent(attr->children, (const xmlChar *)value);
          return;
      }
  }
  // Looks up the attribute called nameStr on node and returns its value as a string.
  //
  // Only the first attribute with a matching name is examined, and the value is the
  // content of that attribute's first child node. Returns nil when either argument
  // is NULL, when node is not an element, when the attribute is absent, or when it
  // carries no content.
  NSString * getAttributeNamed(xmlNode * node, const char * nameStr)
  {
      if (node == NULL || nameStr == NULL)
          return nil;

      // libxml2 only keeps an attribute list in `properties` for element nodes.
      if (node->type != XML_ELEMENT_NODE)
          return nil;

      for (xmlAttrPtr attr = node->properties; attr != NULL; attr = attr->next)
      {
          if (strcmp((const char *)attr->name, nameStr) != 0)
              continue;

          xmlNode * valueNode = attr->children;
          // A missing child or NULL content must not reach stringWithCString:.
          if (valueNode == NULL || valueNode->content == NULL)
              return nil;

          return [NSString stringWithCString:(const char *)valueNode->content
                                    encoding:NSUTF8StringEncoding];
      }
      return nil;
  }
  // Returns the value of the attribute called name on this node, as produced by the
  // C helper getAttributeNamed(). Returns nil when name is nil.
  -(NSString*)getAttributeNamed:(NSString*)name
  {
      const char * nameStr = [name UTF8String];
      // Messaging a nil name yields NULL, which the helper would hand to strcmp().
      if (nameStr == NULL)
          return nil;
      return getAttributeNamed(_node, nameStr);
  }
  // Returns the value of this node's "class" attribute, as reported by
  // -getAttributeNamed:.
  -(NSString*)className
  {
      NSString * classAttribute = [self getAttributeNamed:@"class"];
      return classAttribute;
  }
  // Returns the tag name of this node (the libxml2 node name decoded as UTF-8),
  // or nil when the wrapper holds no node or the node has no name.
  -(NSString*)tagName
  {
      // Wrappers around NULL are created by this class's own accessors, and a NULL
      // C string must not be passed to stringWithCString:encoding:.
      if (_node == NULL || _node->name == NULL)
          return nil;

      return [NSString stringWithCString:(const char *)_node->name
                                encoding:NSUTF8StringEncoding];
  }
  // Returns a new HTMLNode wrapping the first child of this node.
  // A wrapper is returned even when there are no children; it then wraps NULL.
  -(HTMLNode*)firstChild
  {
      // Guard the dereference: this wrapper may itself have been built around NULL.
      xmlNode * first = (_node != NULL) ? _node->children : NULL;
      return [[HTMLNode alloc] initWithXMLNode:first];
  }
  // Walks node, its following siblings and all of their descendants, and appends an
  // HTMLNode wrapper to array for every node whose attribute named `attribute` has a
  // value matching className.
  //
  // attribute  - attribute name to look for (C string); nothing is done when NULL.
  // className  - value to compare against (C string); nothing is done when NULL.
  // node       - first node of the sibling list to search; may be NULL.
  // array      - receives one wrapper per matching node.
  // partial    - YES matches when the value contains className (strstr),
  //              NO requires an exact match (strcmp).
  -(void)findChildrenWithAttribute:(const char*)attribute matchingName:(const char*)className inXMLNode:(xmlNode *)node inArray:(NSMutableArray*)array allowPartial:(BOOL)partial
  {
      // The NSString-based callers pass -UTF8String results, which are NULL for nil.
      if (attribute == NULL || className == NULL)
          return;

      for (xmlNode * current = node; current != NULL; current = current->next)
      {
          // libxml2 only keeps an attribute list in `properties` for element nodes.
          xmlAttrPtr attr = (current->type == XML_ELEMENT_NODE) ? current->properties : NULL;
          for (; attr != NULL; attr = attr->next)
          {
              if (strcmp((const char *)attr->name, attribute) != 0)
                  continue;

              for (xmlNode * valueNode = attr->children; valueNode != NULL; valueNode = valueNode->next)
              {
                  const char * value = (const char *)valueNode->content;
                  if (value == NULL)
                      continue;   // nothing to compare; would crash strcmp/strstr

                  BOOL match = partial ? (strstr(value, className) != NULL)
                                       : (strcmp(value, className) == 0);
                  if (match)
                  {
                      [array addObject:[[HTMLNode alloc] initWithXMLNode:current]];
                      break;      // one entry per node is enough
                  }
              }
              break;              // only the first attribute with this name is examined
          }

          // Descend regardless of whether this node matched.
          [self findChildrenWithAttribute:attribute
                             matchingName:className
                                inXMLNode:current->children
                                  inArray:array
                             allowPartial:partial];
      }
  }
  // Recursively collects every node named tagName, starting at node and covering its
  // following siblings and all of their descendants. A wrapper for each hit is
  // appended to array. Does nothing when tagName has no UTF-8 representation.
  -(void)findChildTags:(NSString*)tagName inXMLNode:(xmlNode *)node inArray:(NSMutableArray*)array
  {
      const char * wanted = [tagName UTF8String];
      if (!wanted)
          return;

      xmlNode * current = node;
      while (current)
      {
          BOOL sameTag = current->name && strcmp((char*)current->name, wanted) == 0;
          if (sameTag)
          {
              [array addObject:[[HTMLNode alloc] initWithXMLNode:current]];
          }

          // Matching nodes are still searched for nested matches.
          [self findChildTags:tagName inXMLNode:current->children inArray:array];
          current = current->next;
      }
  }
  // Returns wrappers for all descendants of this node whose tag name equals tagName.
  // The result is empty when nothing matches or when this wrapper holds no node.
  -(NSArray*)findChildTags:(NSString*)tagName
  {
      NSMutableArray * array = [NSMutableArray array];
      // Guard the dereference: this wrapper may have been built around NULL.
      if (_node == NULL)
          return array;

      [self findChildTags:tagName inXMLNode:_node->children inArray:array];
      return array;
  }
  // Depth-first search for the first node named tagName, starting at node and
  // covering its following siblings and their descendants.
  // Returns a wrapper for the first hit, or nil when nothing matches or when
  // tagName has no UTF-8 representation (e.g. it is nil).
  -(HTMLNode*)findChildTag:(NSString*)tagName inXMLNode:(xmlNode *)node
  {
      const char * tagNameStr = [tagName UTF8String];
      // Same guard as findChildTags:inXMLNode:inArray: — never hand NULL to strcmp().
      if (tagNameStr == NULL)
          return nil;

      for (xmlNode * current = node; current != NULL; current = current->next)
      {
          if (current->name != NULL && strcmp((const char *)current->name, tagNameStr) == 0)
          {
              return [[HTMLNode alloc] initWithXMLNode:current];
          }

          HTMLNode * nested = [self findChildTag:tagName inXMLNode:current->children];
          if (nested != nil)
          {
              return nested;
          }
      }
      return nil;
  }
  // Returns a wrapper for the first descendant of this node whose tag name equals
  // tagName, or nil when there is none, tagName is nil, or this wrapper holds no node.
  -(HTMLNode*)findChildTag:(NSString*)tagName
  {
      // A nil tagName would become a NULL C string further down, and _node may be
      // NULL because this class creates wrappers around NULL.
      if (_node == NULL || tagName == nil)
          return nil;

      return [self findChildTag:tagName inXMLNode:_node->children];
  }
  // Returns wrappers for the direct children of this node, in document order.
  // The result is empty when there are no children or this wrapper holds no node.
  -(NSArray*)children
  {
      NSMutableArray * array = [NSMutableArray array];
      // Guard the dereference: this wrapper may have been built around NULL.
      xmlNode * child = (_node != NULL) ? _node->children : NULL;
      for (; child != NULL; child = child->next)
      {
          [array addObject:[[HTMLNode alloc] initWithXMLNode:child]];
      }
      return array;
  }
  159. /*
  160. -(NSString*)description
  161. {
  162. NSString * string = [NSString stringWithFormat:@"<%s>%@\n", _node->name, [self contents]];
  163. for (HTMLNode * child in [self children])
  164. {
  165. string = [string stringByAppendingString:[child description]];
  166. }
  167. string = [string stringByAppendingString:[NSString stringWithFormat:@"<%s>\n", _node->name]];
  168. return string;
  169. }*/
  // Depth-first search for the first node whose attribute named `attribute` has a
  // value matching name. The search starts at node and covers its following
  // siblings and all of their descendants.
  //
  // attribute - attribute name to look for (C string); nil is returned when NULL.
  // name      - value to compare against (C string); nil is returned when NULL.
  // node      - first node of the sibling list to search; may be NULL.
  // partial   - YES matches when the value contains name (strstr),
  //             NO requires an exact match (strcmp).
  //
  // Returns a wrapper for the first matching node, or nil when nothing matches.
  -(HTMLNode*)findChildWithAttribute:(const char*)attribute matchingName:(const char*)name inXMLNode:(xmlNode *)node allowPartial:(BOOL)partial
  {
      // The NSString-based callers pass -UTF8String results, which are NULL for nil.
      if (node == NULL || attribute == NULL || name == NULL)
          return nil;

      for (xmlNode * current = node; current != NULL; current = current->next)
      {
          // libxml2 only keeps an attribute list in `properties` for element nodes.
          xmlAttrPtr attr = (current->type == XML_ELEMENT_NODE) ? current->properties : NULL;
          for (; attr != NULL; attr = attr->next)
          {
              if (strcmp((const char *)attr->name, attribute) != 0)
                  continue;

              for (xmlNode * valueNode = attr->children; valueNode != NULL; valueNode = valueNode->next)
              {
                  const char * value = (const char *)valueNode->content;
                  if (value == NULL)
                      continue;   // nothing to compare; would crash strcmp/strstr

                  BOOL match = partial ? (strstr(value, name) != NULL)
                                       : (strcmp(value, name) == 0);
                  if (match)
                  {
                      return [[HTMLNode alloc] initWithXMLNode:current];
                  }
              }
              break;              // only the first attribute with this name is examined
          }

          HTMLNode * nested = [self findChildWithAttribute:attribute
                                              matchingName:name
                                                 inXMLNode:current->children
                                              allowPartial:partial];
          if (nested != nil)
          {
              return nested;
          }
      }
      return nil;
  }
  // Returns a wrapper for the first descendant whose attribute named `attribute`
  // matches className (substring match when partial is YES, exact match otherwise).
  // Returns nil when nothing matches, when either string is nil, or when this
  // wrapper holds no node.
  -(HTMLNode*)findChildWithAttribute:(NSString*)attribute matchingName:(NSString*)className allowPartial:(BOOL)partial
  {
      const char * attributeStr = [attribute UTF8String];
      const char * valueStr = [className UTF8String];
      // nil strings yield NULL C strings, which the search would pass to strcmp().
      if (_node == NULL || attributeStr == NULL || valueStr == NULL)
          return nil;

      return [self findChildWithAttribute:attributeStr
                             matchingName:valueStr
                                inXMLNode:_node->children
                             allowPartial:partial];
  }
  // Returns a wrapper for the first descendant whose "class" attribute equals
  // className exactly, or nil when there is none, className is nil, or this
  // wrapper holds no node.
  -(HTMLNode*)findChildOfClass:(NSString*)className
  {
      const char * classStr = [className UTF8String];
      // A nil className yields NULL, which the search would pass to strcmp().
      if (_node == NULL || classStr == NULL)
          return nil;

      return [self findChildWithAttribute:"class"
                             matchingName:classStr
                                inXMLNode:_node->children
                             allowPartial:NO];
  }
  // Returns wrappers for every descendant whose attribute named `attribute` matches
  // className (substring match when partial is YES, exact match otherwise).
  // The result is empty when nothing matches, when either string is nil, or when
  // this wrapper holds no node.
  -(NSArray*)findChildrenWithAttribute:(NSString*)attribute matchingName:(NSString*)className allowPartial:(BOOL)partial
  {
      NSMutableArray * array = [NSMutableArray array];

      const char * attributeStr = [attribute UTF8String];
      const char * valueStr = [className UTF8String];
      // nil strings yield NULL C strings, which the search would pass to strcmp().
      if (_node == NULL || attributeStr == NULL || valueStr == NULL)
          return array;

      [self findChildrenWithAttribute:attributeStr
                         matchingName:valueStr
                            inXMLNode:_node->children
                              inArray:array
                         allowPartial:partial];
      return array;
  }
  // Returns every descendant whose "class" attribute equals className exactly, by
  // delegating to -findChildrenWithAttribute:matchingName:allowPartial:.
  -(NSArray*)findChildrenOfClass:(NSString*)className
  {
      NSArray * matches = [self findChildrenWithAttribute:@"class"
                                              matchingName:className
                                              allowPartial:NO];
      return matches;
  }
  // Initialises the wrapper around the given libxml2 node. The pointer is stored
  // as-is; it is neither copied nor checked for NULL.
  -(id)initWithXMLNode:(xmlNode*)xmlNode
  {
      self = [super init];
      if (self != nil)
      {
          _node = xmlNode;
      }
      return self;
  }
  // Appends the content of node, its following siblings and all of their
  // descendants to string, in depth-first order. Nodes without content, or whose
  // content cannot be decoded as UTF-8, contribute nothing.
  -(void)appendChildContentsToString:(NSMutableString*)string inNode:(xmlNode*)node
  {
      for (xmlNode * current = node; current != NULL; current = current->next)
      {
          if (current->content != NULL)
          {
              NSString * text = [NSString stringWithCString:(const char *)current->content
                                                   encoding:NSUTF8StringEncoding];
              // Decoding can fail and yield nil, and appendString: rejects nil.
              if (text != nil)
              {
                  [string appendString:text];
              }
          }
          [self appendChildContentsToString:string inNode:current->children];
      }
  }
  // Returns the content of this node's first child decoded as UTF-8, or nil when
  // this wrapper holds no node, the node has no children, or the first child has
  // no content.
  -(NSString*)contents
  {
      // Guard the dereference: this wrapper may have been built around NULL.
      xmlNode * first = (_node != NULL) ? _node->children : NULL;
      if (first == NULL || first->content == NULL)
          return nil;

      return [NSString stringWithCString:(const char *)first->content
                                encoding:NSUTF8StringEncoding];
  }
  // Maps a libxml2 node to an HTMLNodeType by comparing its name against a fixed
  // list of tag names. Returns HTMLUnkownNode for a NULL node, a node without a
  // name, or a name that is not in the list. The comparison is case-sensitive.
  HTMLNodeType nodeType(xmlNode * _node)
  {
      // Lookup table, checked in order.
      // NOTE(review): the image entry matches the literal name "image"; an <img>
      // element would not match it — confirm whether that is intended.
      static const struct { const char * tag; HTMLNodeType type; } knownTags[] = {
          { "a",          HTMLHrefNode },
          { "text",       HTMLTextNode },
          { "code",       HTMLCodeNode },
          { "span",       HTMLSpanNode },
          { "p",          HTMLPNode },
          { "ul",         HTMLUlNode },
          { "li",         HTMLLiNode },
          { "image",      HTMLImageNode },
          { "ol",         HTMLOlNode },
          { "strong",     HTMLStrongNode },
          { "pre",        HTMLPreNode },
          { "blockquote", HTMLBlockQuoteNode },
      };

      if (!_node || !_node->name)
          return HTMLUnkownNode;

      const char * tagName = (const char*)_node->name;
      for (size_t i = 0; i < sizeof(knownTags) / sizeof(knownTags[0]); i++)
      {
          if (strcmp(tagName, knownTags[i].tag) == 0)
              return knownTags[i].type;
      }
      return HTMLUnkownNode;
  }
  // Returns the HTMLNodeType of the wrapped node, as classified by nodeType().
  -(HTMLNodeType)nodetype
  {
      HTMLNodeType type = nodeType(_node);
      return type;
  }
  // Returns the content of node as reported by xmlNodeGetContent(), decoded as UTF-8.
  // Returns nil for a NULL node and the empty string when libxml2 reports no content.
  NSString * allNodeContents(xmlNode*node)
  {
      if (!node)
          return nil;

      void * text = xmlNodeGetContent(node);
      if (!text)
          return @"";

      NSString * result = [NSString stringWithCString:text encoding:NSUTF8StringEncoding];
      // The buffer returned by xmlNodeGetContent() is released here with xmlFree().
      xmlFree(text);
      return result;
  }
  // Returns the content of the wrapped node as produced by allNodeContents().
  -(NSString*)allContents
  {
      NSString * text = allNodeContents(_node);
      return text;
  }
  // Serialises node with htmlNodeDumpOutput() and returns the resulting bytes
  // decoded as UTF-8.
  // Returns nil when node is NULL, when node is not attached to a document, when
  // libxml2 cannot allocate its buffers, or when nothing was written.
  NSString * rawContentsOfNode(xmlNode * node)
  {
      // node->doc is dereferenced below for the encoding, so both must be present.
      if (node == NULL || node->doc == NULL)
          return nil;

      xmlBufferPtr buffer = xmlBufferCreateSize(1000);
      if (buffer == NULL)
          return nil;

      xmlOutputBufferPtr output = xmlOutputBufferCreateBuffer(buffer, NULL);
      if (output == NULL)
      {
          xmlBufferFree(buffer);
          return nil;
      }

      htmlNodeDumpOutput(output, node->doc, node, (const char *)node->doc->encoding);
      xmlOutputBufferFlush(output);

      NSString * string = nil;
      // Use the accessor rather than reaching into the xmlBuffer struct.
      const xmlChar * bytes = xmlBufferContent(buffer);
      if (bytes != NULL)
      {
          string = [[NSString alloc] initWithBytes:(const void *)bytes
                                            length:xmlBufferLength(buffer)
                                          encoding:NSUTF8StringEncoding];
      }

      // The string owns a copy of the bytes, so both libxml2 objects can go now.
      xmlOutputBufferClose(output);
      xmlBufferFree(buffer);
      return string;
  }
  // Returns the serialised form of the wrapped node as produced by rawContentsOfNode().
  -(NSString*)rawContents {
      NSString * markup = rawContentsOfNode(_node);
      return markup;
  }
  325. @end