Changeset 61
- Timestamp:
- 03/07/2009 08:17:21 PM (3 years ago)
- File:
-
- 1 edited
-
trunk/WordPress/plugin/transposh/parser.php (modified) (11 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/WordPress/plugin/transposh/parser.php
r57 r61 54 54 $is_in_body = FALSE; 55 55 56 //Is current position within the channel tag, i.e. RSS feed 57 $is_in_channel = FALSE; 58 56 59 /* 57 60 * Parse the html page into tags, identify translateable string which … … 85 88 //do nothing 86 89 } 90 else if($element == '![CDATA[') 91 { 92 process_cdata_section(); 93 } 87 94 else 88 95 { 89 96 //Mark tag start position 90 97 $tag_start = $pos; 91 92 //skip to the '>' marking the end of the element 93 if ($element == "!") { 94 $pos = strpos($page, '[', $pos); 95 }else { 96 $pos = strpos($page, '>', $pos); 97 } 98 98 $pos = strpos($page, '>', $pos); 99 99 100 //Mark tag end position 100 101 $tag_end = $pos; … … 196 197 case 'div' : 197 198 case 'span': 198 //case 'description':199 //case 'content:encoded':200 logger("in case : $element",1);201 199 process_span_or_div_tag($element, $start, $end); 202 200 break; 203 case 'html': 204 process_html_tag($start, $end); 205 break; 206 case 'body': 207 case 'channel': 208 global $is_in_body; 209 $is_in_body = TRUE; 210 break; 201 case 'html': 202 process_html_tag($start, $end); 203 break; 204 case 'body': 205 global $is_in_body; 206 $is_in_body = TRUE; 207 break; 208 case 'channel': 209 global $is_in_channel; 210 $is_in_channel = TRUE; 211 break; 211 212 } 212 213 … … 297 298 $start = $pos; 298 299 299 //keep scanning till the first white space or the '>' mark 300 // TODO - for CDATA, check '[' 301 while($pos < strlen($page) && $page[$pos] != ' ' && $page[$pos] != '[' && 302 $page[$pos] != '>' && $page[$pos] != '\t') 303 { 304 $pos++; 305 } 306 300 //check first for a character data section - treat it like an element 301 if(is_word('![CDATA[')) 302 { 303 $pos += 8; //skip to end of ![CDATA[ 304 } 305 else 306 { 307 //keep scanning till the first white space or the '>' mark 308 while($pos < strlen($page) && !is_white_space($pos) && $page[$pos] != '>') 309 { 310 $pos++; 311 } 312 } 313 307 314 logger("Exit " . __METHOD__. ": $pos", 5); 308 315 return substr($page,$start, $pos - $start); … … 377 384 function process_current_tag() 378 385 { 379 global $page, $pos, $tags_list, $is_in_body; 380 381 $current_tag = end($tags_list); 382 383 logger("Enter " . __METHOD__ ." : $current_tag", 4); 384 385 //translate only elements within the body or title 386 if($is_in_body || $current_tag == 'title') 386 global $page, $pos; 387 388 logger("Enter " . __METHOD__ , 4); 389 390 //translate only known elements within the body or channel 391 if(is_translatable_section()) 387 392 { 388 393 skip_white_space(); 389 394 $start = $pos; 390 395 $page_length = strlen($page); 391 396 397 // Indicates if the html entity should break into a new translation segment. 398 $is_breaker = FALSE; 399 392 400 while($pos < $page_length && $page[$pos] != '<') 393 401 { 394 402 //will break translation unit when one of the following characters is reached: ., 395 if(is_sentence_breaker($pos)) 403 if(($end_of_entity = is_html_entity($pos, $is_breaker))) 404 { 405 //Check if should break - value has been set by the is_html_entity function 406 if($is_breaker) 407 { 408 translate_text($start); 409 $start = $end_of_entity; 410 } 411 412 //skip past entity 413 $pos = $end_of_entity; 414 } 415 else if(is_sentence_breaker($pos)) 396 416 { 397 417 translate_text($start); … … 399 419 $start = $pos; 400 420 } 401 else if(($end_of_entity = is_html_entity($pos))) 421 else 422 { 423 $pos++; 424 } 425 } 426 427 if($pos > $start) 428 { 429 translate_text($start); 430 } 431 } 432 logger("Exit" . __METHOD__ . " : $current_tag" , 4); 433 } 434 435 436 /* 437 * Translate the content of a cdata section. For now we only expect to handle it 438 * within RSS feeds. 439 */ 440 function process_cdata_section() 441 { 442 global $page, $pos; 443 444 logger("Enter " . __METHOD__ , 4); 445 446 //translate only known elements within rss feeds 447 if(is_translatable_section()) 448 { 449 skip_white_space(); 450 $start = $pos; 451 $page_length = strlen($page); 452 453 while($pos < $page_length && !is_word(']]>')) 454 { 455 //will break translation unit when one of the following characters is reached: ., 456 if(is_sentence_breaker($pos) || 457 $page[$pos] == '<' || $page[$pos] == '>') //only in cdata the < > are valid breakers as well 402 458 { 403 459 translate_text($start); 404 460 $pos++; 405 $start = $ end_of_entity;461 $start = $pos; 406 462 } 407 463 else … … 416 472 } 417 473 } 418 logger("Exit" . __METHOD__ . " : $current_tag" , 4); 419 } 420 474 logger("Exit " . __METHOD__ , 4); 475 } 476 477 /** 478 * Determines position in page marks a transaltable tag in html page or rss feed section. 479 * Return TRUE if should be translated otherwise FALSE. 480 */ 481 function is_translatable_section() 482 { 483 global $tags_list, $is_in_channel, $is_in_body; 484 $rc = FALSE; 485 $current_tag = end($tags_list); 486 487 if($is_in_body || $current_tag == 'title') 488 { 489 $rc = TRUE; 490 } 491 else if($is_in_channel && 492 ($current_tag == 'title' || $current_tag == 'description' || $current_tag == 'category')) 493 { 494 $rc = TRUE; 495 } 496 497 logger("Exit " . __METHOD__ . " $current_tag, translate: " . ($rc ? "yes" : "no"), 4); 498 return $rc; 499 } 421 500 422 501 /* … … 440 519 } 441 520 else if($page[$position] == ',' || $page[$position] == '?' || 442 $page[$position] == '(' || $page[$position] == ')' || 443 $page[$position] == '[' || $page[$position] == ']' || 444 $page[$position] == '"' || $page[$position] == '!' || 445 $page[$position] == ':' || $page[$position] == '|') 521 $page[$position] == '(' || $page[$position] == ')' || 522 $page[$position] == '[' || $page[$position] == ']' || 523 $page[$position] == '"' || $page[$position] == '!' || 524 $page[$position] == ':' || $page[$position] == '|' || 525 $page[$position] == ';' || 526 //break on numbers but not like: 3rd, 4th 527 (is_digit($position) && !is_a_to_z_character($position+1))) 446 528 { 447 529 //break the sentence into segments regardless of the next character. … … 455 537 * Determines if the current position marks the begining of an html 456 538 * entity. E.g & 457 * Return 0 if not an html entity otherwise return the position past this 458 * entity.539 * Return 0 if not an html entity otherwise return the position past this entity. In addition 540 * the $is_breaker will be set to TRUE if entity should break translation into a new segment. 459 541 * 460 542 */ 461 function is_html_entity($position )543 function is_html_entity($position, &$is_breaker) 462 544 { 463 545 global $page; 546 $is_breaker = FALSE; 464 547 if($page[$position] == "&" ) 465 548 { … … 478 561 //Don't break on ` so for our use we don't consider it an entity 479 562 //e.g. Jack`s apple 480 if( $entity == "’" || $entity == "'")481 { 482 return 0;563 if(!($entity == "’" || $entity == "'" || $entity == "'")) 564 { 565 $is_breaker = TRUE; 483 566 } 484 567 … … 593 676 return $index; 594 677 } 678 679 680 /* 681 * Check within page buffer position for the given word. 682 * param word The word to look for. 683 * param index1 Optional position within the page buffer, if not available then the current 684 * position ($pos) is used. 685 * Return TRUE if the word matches otherwise FALSE 686 */ 687 function is_word($word, $index1) 688 { 689 global $page, $pos; 690 $rc = FALSE; 691 692 if(!isset($index1)) 693 { 694 //use $pos as the default position if not specified otherwise 695 $index1 = $pos; 696 } 697 698 $index2 = 0; //position within word 699 $word_len = strlen($word); 700 $page_length = strlen($page); 701 702 while($index1 < $page_length && $index2 < $word_len) 703 { 704 if($page[$index1] == $word[$index2]) 705 { 706 $index1++; 707 $index2++; 708 } 709 else 710 { 711 break; 712 } 713 } 714 715 //check if we have full match 716 if($index2 == $word_len) 717 { 718 $rc = TRUE; 719 } 720 721 return $rc; 722 } 723 595 724 596 725 /**
Note: See TracChangeset
for help on using the changeset viewer.
