=== modified file 'import_typepad.module' --- old/import_typepad.module 2009-03-27 17:34:20 +0000 +++ new/import_typepad.module 2009-03-30 15:16:15 +0000 @@ -153,16 +153,66 @@ '#options' => $users); } - $import_form["taxonomy_mappings"] = array( + // Skip taxonomy fieldset if nothing to show + if ($cats) { + $import_form["taxonomy_mappings"] = array( + "#type" => "fieldset", + "#title" => t("Taxonomy mappings") + ); + + foreach ($cats as $cat=>$count){ + $item = _import_typepad_taxonomy_list($cat); + $item['#title'] = "$cat ($count)"; + $import_form["taxonomy_mappings"]['taxonomy_'.str_replace(' ','_',$cat)] = $item; + } + } + + $import_form["tidy_content"] = array( "#type" => "fieldset", - "#title" => t("Taxonomy mappings")); - - foreach ($cats as $cat=>$count){ - $item = _import_typepad_taxonomy_list($cat); - $item['#title'] = "$cat ($count)"; - $import_form["taxonomy_mappings"]['taxonomy_'.str_replace(' ','_',$cat)] = $item; - } - + "#title" => t("Content Clean & Update"), + ); + $import_form["tidy_content"]["import_images"] = array ( + "#type" => "checkbox", + "#title" => "Copy images included in Blog posts to Drupal", + "#default_value" => false, + '#description' => t('If the IMCE module is installed, images will be copied to the IMCE image folder set for your role, otherwise to the Drupal files directory. Links are switched to the Drupal copy of the image if the copy succeeded.'), + ); + // Check that php.ini setting + if (!ini_get('allow_url_fopen')) { + $import_form["tidy_content"]["import_images"]['#description'] = t('Image importing requires allow_url_fopen enabled in php.ini. 
Disabled until that\'s corrected.'); + $import_form["tidy_content"]["import_images"]['#disabled'] = true; + } + + $import_form["tidy_content"]["rm_ms_garbage"] = array ( + "#type" => "checkbox", + "#title" => "Remove garbage metadata added if any posts were pasted from Microsoft Office", + "#default_value" => false, + ); + $import_form["tidy_content"]["rm_empty_paragraphs"] = array ( + "#type" => "checkbox", + "#title" => "Remove empty tag", + "#default_value" => false, + '#description' => t('Primarly for removing empty <p> tags, but will also remove any empty tag and repeat <br> tags.'), + ); + $import_form["tidy_content"]['update_teasers'] = array ( + "#type" => "checkbox", + "#title" => "Recreate teasers based on Drupal settings and cleaned up imported body", + "#default_value" => false, + '#description' => t('Only uses cleaned up version of body if appropriate checkboxes above are checked.'), + ); + // Possible filter format user can use + $options[0] = 'Default'; + foreach(filter_formats() as $format) { + $options[$format->format] = $format->name; + } + + $import_form["tidy_content"]["applied_input_format"] = array ( + "#type" => "select", + "#title" => "Set input format for imported nodes", + "#options" => $options, + '#description' => t('Note that the input format selections is based on your user account. If you are importing the blogs for another author, set in Author mappings above, they might not have permission to the same input formats. If you set the input format to one the author doesn\'t have permission for, they will not be able to edit the imported blog posts.'), + ); + $import_form["import_warning"] = array ( "#type" => "markup", "#value" => t('Importing may take a while, do not click \'Import\' more than once. To see progress, look at the administer content page in a new window.

')); @@ -348,7 +398,7 @@ foreach ($blogCats as $id=>$val){ $c[] = $taxonomy['taxonomy_'.str_replace(' ','_',$val)]; } - _import_typepad_save($currentBlog, $c); + _import_typepad_save($currentBlog, $c, $authors['import_images'], $authors['applied_input_format'], $authors['rm_ms_garbage'], $authors['rm_empty_paragraphs'], $authors['update_teasers']); $success++; } else if ($previewCount < $preview){ $output .= node_view($currentBlog); @@ -421,7 +471,7 @@ foreach ($blogCats as $id=>$val){ $c[] = $taxonomy['taxonomy_'.str_replace(' ','_',$val)]; } - _import_typepad_save($currentBlog, $c); + _import_typepad_save($currentBlog, $c, $authors['import_images'], $authors['applied_input_format'], $authors['rm_ms_garbage'], $authors['rm_empty_paragraphs'], $authors['update_teasers']); $success++; } else if ($previewCount < $preview){ $output .= node_view($currentBlog); @@ -446,9 +496,103 @@ } /** + * Parse for image urls & copy to Drupal + */ +function _import_typepad_import_images($body) { + if (module_exists('imce')) { + $conf = imce_settings_user(); + $conf['allowedExt'] = $GLOBALS['imce_ext']; + } else { + $conf['dir'] = file_directory_path(); + $conf['allowedExt'] = array('.jpg', '.gif', '.png'); + } + + // finds any sets, copies both the href & src; only works for absolute + // paths, either starting with http or /; this prevents any bad images from causing + // an post import failure + preg_match_all("%href=\"((http:|/)[^\"]+)[^>]+>]+> regardless of link tag to ensure any unlinkded images are also imported, copies + // src only; only works for absolute paths, either starting with http or /; this prevents any bad + // images from causing an post import failure + preg_match_all("% $ext)), 'error'); + return FALSE; + } + } else { + // Really just a guess; proper way to handle it would be to copy file to temp directory, check + // file type of temp file to pick intelligent extension, then move to permenant location + $ext = '.jpg'; + } + + //clear filename and copy file + 
$cleared = preg_replace("/[^\w\-\_]/", '_', substr($file_name, 0)) . $ext; + $newpath = file_create_filename($cleared, $_SERVER['DOCUMENT_ROOT'] . base_path() . $conf['dir']); + // Need doc root here, but not in array below + if (!@copy($url, $newpath)) { + drupal_set_message(t('Error copying %file.', array('%file' => $url)), 'error'); + } else { + @chmod($newpath, 0664); + $copied++; + $source_files[] = $url; + // Strip the doc root so that URL references are absolute from wwwroot. Doc root path + // was needed to copy & chmod above, as well as check for collisions in file names + $dest_files[] = substr($newpath, strlen($_SERVER['DOCUMENT_ROOT'])); + } + } + + $body = str_replace($source_files, $dest_files, $body); + drupal_set_message(t('%num images copied succesfully', array('%num' => $copied))); + return $body; +} + +/** + * Clean up MS garbage & empty tags - repeat BR tags too, but there's a comment on + * how to disable that if you prefer + */ +function _import_typepad_rm_garbage($body, $rmGarbage, $rmEmptyP) { + $garbage = array(); + + if ($rmGarbage) { + $garbage[] = "||s"; + $garbage[] = "| class=\"MsoNormal[^\"]{0,}\"|s"; + } + if ($rmEmptyP) { + // This one would remove all tags containing just one encoded char, probably + // OK, but real culprit is nbsp + //$garbage[] = "%<[^/>]+>(&[a-z0-9]{2,4};|)]+>%"; + $garbage[] = "%<[^/>]+>( |)]+>%"; + + // Comment out this next line if you don't want to include repeat
tags + $garbage[] = "%(\s?){2,}%"; + } + if ($garbage) { + $body = preg_replace($garbage, '', $body); + } + return $body; +} + +/** + * Save the content into the database, first the blogs, then the comments */ -function _import_typepad_save($currentBlog, $terms){ +function _import_typepad_save($currentBlog, $terms, $cpImages, $format, $rmGarbage, $rmEmptyP, $updateTeaser){ if ($currentBlog != null){ // Apply all the substitutions $subReplace = $_REQUEST['import_typepad_substitution_replace']; @@ -460,6 +604,24 @@ $currentBlog->body = str_replace($val, $subWith[$id], $currentBlog->body); } + if ($rmGarbage || $rmEmptyP) { + $currentBlog->body = _import_typepad_rm_garbage($currentBlog->body, $rmGarbage, $rmEmptyP); + $currentBlog->teaser = _import_typepad_rm_garbage($currentBlog->teaser, $rmGarbage, $rmEmptyP); + } + + if ($cpImages) { + $currentBlog->body = _import_typepad_import_images($currentBlog->body); + } + + // If specified, use format_format + if ($format > 0) { + $currentBlog->format = $format; + } + + if ($updateTeaser) { + $currentBlog->teaser = node_teaser($currentBlog->body, $currentBlog->format); + } + // Save the entry node_save($currentBlog);