? contrib/.svn
? contrib/apachesolr_attachments/.svn
? contrib/apachesolr_image/.svn
? contrib/apachesolr_lang/.svn
? contrib/apachesolr_mlt/.svn
? contrib/apachesolr_multisitesearch/.svn
? contrib/apachesolr_nodeaccess/.svn
? contrib/apachesolr_nodeaccess/tests/.svn
Index: apachesolr.module
===================================================================
RCS file: /cvs/drupal-contrib/contributions/modules/apachesolr/apachesolr.module,v
retrieving revision 1.1.2.12.2.67
diff -u -p -r1.1.2.12.2.67 apachesolr.module
--- apachesolr.module	4 Dec 2008 13:54:12 -0000	1.1.2.12.2.67
+++ apachesolr.module	4 Dec 2008 16:45:58 -0000
@@ -420,6 +420,22 @@ function apachesolr_add_node_document(&$
 }
 
 /**
+ * Strip control characters that cause Jetty/Solr to fail.
+ */
+function apachesolr_strip_ctl_chars($text) {
+  // See: http://w3.org/International/questions/qa-forms-utf-8.html
+  // Removes all bytes below x20 except tab (x09), LF (x0A) and CR (x0D).
+  return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', '', $text);
+}
+
+/**
+ * Strip HTML tags and the control characters that cause Jetty/Solr to fail.
+ */
+function apachesolr_clean_text($text) {
+  return strip_tags(apachesolr_strip_ctl_chars($text));
+}
+
+/**
  * Given a node ID, return a document representing that node.
  */
 function apachesolr_node_to_document($nid) {
@@ -434,6 +450,7 @@ function apachesolr_node_to_document($ni
     $node->build_mode = NODE_BUILD_SEARCH_INDEX;
     $node = node_build_content($node, FALSE, FALSE);
     $node->body = drupal_render($node->content);
+    $node->title = apachesolr_clean_text($node->title);
 
     $text = check_plain($node->title) . ' ' . $node->body;
 
@@ -451,11 +468,11 @@ function apachesolr_node_to_document($ni
     $document->status = $node->status;
     $document->uid = $node->uid;
     $document->title = $node->title;
-    $document->body  = $node->body;
+    $document->body  = apachesolr_clean_text($node->body);
     $document->type  = $node->type;
     $document->changed = $node->changed;
     $document->comment_count = $node->comment_count;
-    $document->name = $node->name;
+    $document->name = apachesolr_strip_ctl_chars($node->name);
     $document->language = $node->language;
 
     // Path aliases can have important information about the content.
@@ -487,10 +504,10 @@ function apachesolr_node_to_document($ni
           // Don't index NULLs or empty strings
           if (isset($value['safe']) && strlen($value['safe'])) {
             if ($cck_info['multiple']) {
-              $document->setMultiValue($index_key, $value['safe']);
+              $document->setMultiValue($index_key, apachesolr_strip_ctl_chars($value['safe']));
             }
             else {
-              $document->$index_key = $value['safe'];
+              $document->$index_key = apachesolr_strip_ctl_chars($value['safe']);
             }
           }
         }
@@ -498,6 +515,7 @@ function apachesolr_node_to_document($ni
     }
 
     // This is the string value of the title. Used for sorting.
+    // TODO - use Solr copyField directive.
     $document->stitle = $node->title;
 
     if (is_array($node->taxonomy)) {
@@ -514,12 +532,12 @@ function apachesolr_node_to_document($ni
           $document->setMultiValue('tid', $ancestor->tid);
           $document->setMultiValue('imfield_vid'. $ancestor->vid, $ancestor->tid);
           $document->setMultiValue('vid', $ancestor->vid);
-          $document->setMultiValue('taxonomy_name', $ancestor->name);
+          $document->setMultiValue('taxonomy_name', apachesolr_strip_ctl_chars($ancestor->name));
           $text .= ' ' . $ancestor->name;
         }
       }
     }
-    $document->text = $text;
+    $document->text = apachesolr_clean_text($text);
 
     // Let modules add to the document
     foreach (module_implements('apachesolr_update_index') as $module) {
Index: contrib/apachesolr_attachments/apachesolr_attachments.module
===================================================================
RCS file: /cvs/drupal-contrib/contributions/modules/apachesolr/contrib/apachesolr_attachments/apachesolr_attachments.module,v
retrieving revision 1.1.2.6
diff -u -p -r1.1.2.6 apachesolr_attachments.module
--- contrib/apachesolr_attachments/apachesolr_attachments.module	2 Dec 2008 23:44:00 -0000	1.1.2.6
+++ contrib/apachesolr_attachments/apachesolr_attachments.module	4 Dec 2008 16:45:58 -0000
@@ -236,8 +236,8 @@ function _asa_get_attachment_text($file)
   $cleaned_text = iconv("utf-8", "utf-8//IGNORE", $text);
 
   // As per robertDouglass - http://drupal.org/node/335871
-  // Bad control character. Do we need to make a hook for text cleanup?
-  $cleaned_text = preg_replace('/\x0C/', '', $cleaned_text);
+  // Strip bad control characters. Do we need to make a hook for text cleanup?
+  $cleaned_text = apachesolr_strip_ctl_chars($cleaned_text);
   
   return $cleaned_text;
 }
