1.20.x path.inc path_clean_string($string, array $options = array())

Clean up a string segment to be used in a URL alias.

Performs the following possible alterations:

  • Remove all HTML tags.
  • Process the string through the transliteration module.
  • Replace or remove punctuation with the separator character.
  • Remove back-slashes.
  • Replace non-ascii and non-numeric characters with the separator.
  • Remove common words.
  • Replace whitespace with the separator character.
  • Trim duplicate, leading, and trailing separators.
  • Convert to lower-case.
  • Shorten to a desired length and logical position based on word boundaries.

This function should *not* be called on URL alias or path strings because it is assumed that they are already clean.

Parameters

$string: A string to clean.

array $options: (optional) A keyed array of settings and flags to control the path pattern string replacement process. Supported options are:

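  • language: A language object. If its langcode property is non-empty, it takes precedence over the langcode option.
  • langcode: A language code to be used when transliterating the string. If neither option yields a language code, the current language is used.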

Return value

The cleaned string.

File

modules/path/path.inc, line 98
Miscellaneous functions for Path module.

Code

/**
 * Clean up a string segment to be used in a URL alias.
 *
 * Steps applied, in order: decode entities and strip HTML tags, optionally
 * transliterate, replace or remove punctuation per the Path settings,
 * optionally reduce to ASCII letters, digits and "/", remove ignored words
 * (unless that would leave nothing), replace whitespace with the separator,
 * collapse/trim separators, optionally lower-case, and finally truncate to the
 * maximum component length on a word boundary.
 *
 * Settings are read once from the 'path.settings' config and kept, together
 * with every computed result, in a static cache keyed by language code and
 * input string.
 *
 * @param $string
 *   A string to clean.
 * @param array $options
 *   (optional) A keyed array of settings. Supported keys:
 *   - language: An object whose non-empty langcode property is used as the
 *     language code. Takes precedence over 'langcode'.
 *   - langcode: A language code used for transliteration. When neither key
 *     yields a code (or the code is LANGUAGE_NONE), the langcode of the
 *     global $language is used.
 *
 * @return
 *   The cleaned string. An empty string is returned when $string is '' or
 *   NULL.
 */
function path_clean_string($string, array $options = array()) {
  // Use the advanced backdrop_static() pattern, since this is called very often.
  static $backdrop_static_fast;
  if (!isset($backdrop_static_fast)) {
    $backdrop_static_fast['cache'] = &backdrop_static(__FUNCTION__);
  }
  $cache = &$backdrop_static_fast['cache'];

  // Generate and cache variables used in this function so that on the second
  // call to path_clean_string() we focus on processing.
  if (!isset($cache)) {
    $config = config('path.settings');
    $cache = array(
      'separator' => $config->get('separator'),
      'strings' => array(),
      'transliterate' => $config->get('transliterate'),
      'punctuation' => array(),
      'reduce_ascii' => (bool) $config->get('reduce_ascii'),
      'ignore_words_regex' => FALSE,
      'lowercase' => (bool) $config->get('case'),
      'maxlength' => min($config->get('max_component_length'), _path_get_schema_alias_maxlength()),
    );

    // Generate and cache the punctuation replacements for strtr().
    $punctuation = path_punctuation_chars();
    foreach ($punctuation as $name => $details) {
      $action = $config->get('punctuation_' . $name);
      switch ($action) {
        case PATH_PUNCTUATION_REMOVE:
          $cache['punctuation'][$details['value']] = '';
          break;
        case PATH_PUNCTUATION_REPLACE:
          $cache['punctuation'][$details['value']] = $cache['separator'];
          break;
        case PATH_PUNCTUATION_DO_NOTHING:
          // Literally do nothing: the character gets no strtr() entry.
          break;
      }
    }

    // Copy settings from hyphen, single, and double-quotes to simple versions.
    // NOTE(review): the single prime is mapped to the double-quote setting
    // while the other single marks use the single-quote setting — confirm
    // whether that is intentional.
    $fancy_character_map = array(
      '‘' => '\'', // Single opening curly quote.
      '’' => '\'', // Single closing curly quote.
      '‚' => '\'', // Single low-quote.
      '′' => '"', // Single prime.
      '“' => '"', // Double opening curly quote.
      '”' => '"', // Double closing curly quote.
      '„' => '"', // Double low-quote.
      '″' => '"', // Double prime.
      '–' => '-', // En dash.
      '—' => '-', // Em dash.
    );
    foreach ($fancy_character_map as $fancy_character => $simple_character) {
      // Only inherit when the simple character actually has a replacement. If
      // its action is "do nothing" there is no entry to copy; reading it would
      // raise an undefined index notice and store NULL, which strtr() treats
      // as an empty replacement and would silently remove the fancy character.
      if (!isset($cache['punctuation'][$fancy_character]) && isset($cache['punctuation'][$simple_character])) {
        $cache['punctuation'][$fancy_character] = $cache['punctuation'][$simple_character];
      }
    }

    // Generate and cache the ignored words regular expression. The first
    // pattern trims leading/trailing commas and whitespace; the second turns
    // each remaining run of commas/whitespace into a '\b|\b' alternation.
    // NOTE(review): the words are inserted into the pattern without escaping,
    // so regex metacharacters (or '/' in the preg fallback) in the setting
    // would alter or break the expression.
    $ignore_words = $config->get('ignore_words');
    $ignore_words_regex = preg_replace(array('/^[,\s]+|[,\s]+$/', '/[,\s]+/'), array('', '\b|\b'), $ignore_words);
    if ($ignore_words_regex) {
      $cache['ignore_words_regex'] = '\b' . $ignore_words_regex . '\b';
      if (function_exists('mb_eregi_replace')) {
        $cache['ignore_words_callback'] = 'mb_eregi_replace';
      }
      else {
        // preg_replace() needs delimiters and an explicit case-insensitive
        // flag, unlike mb_eregi_replace().
        $cache['ignore_words_callback'] = 'preg_replace';
        $cache['ignore_words_regex'] = '/' . $cache['ignore_words_regex'] . '/i';
      }
    }

    // Remove to prevent any unintentional use of $config outside of the cache.
    unset($config);
  }

  // Empty strings do not need any processing.
  if ($string === '' || $string === NULL) {
    return '';
  }

  // Resolve the language code: a language object wins over a plain langcode.
  $langcode = LANGUAGE_NONE;
  if (!empty($options['language']->langcode)) {
    $langcode = $options['language']->langcode;
  }
  elseif (!empty($options['langcode'])) {
    $langcode = $options['langcode'];
  }
  if ($langcode == LANGUAGE_NONE) {
    // Paths for language neutral content get transliterated according to
    // current language.
    global $language;
    // We are intentionally not using config_get('system.core', 'language_default')
    // here. That can have unexpected behavior in cases such as on a multilingual
    // site with language-neutral content types.
    $langcode = $language->langcode;
  }

  // Check if the string has already been processed, and if so return the
  // cached result.
  if (isset($cache['strings'][$langcode][$string])) {
    return $cache['strings'][$langcode][$string];
  }

  // Remove all HTML tags from the string.
  $output = strip_tags(decode_entities($string));

  // Optionally transliterate (by running through the Transliteration module).
  if ($cache['transliterate']) {
    // If the reduce strings to letters and numbers is enabled, don't bother
    // replacing unknown characters with a question mark. Use an empty string
    // instead.
    include_once BACKDROP_ROOT . '/core/includes/transliteration.inc';
    $output = transliteration_get($output, $cache['reduce_ascii'] ? '' : '?', $langcode);
  }

  // Replace or drop punctuation based on user settings.
  $output = strtr($output, $cache['punctuation']);

  // Reduce strings to letters and numbers. Forward slashes are kept as well.
  if ($cache['reduce_ascii']) {
    $output = preg_replace('/[^a-zA-Z0-9\/]+/', $cache['separator'], $output);
  }

  // Get rid of words that are on the ignore list, but keep the original text
  // if removing them would leave nothing but whitespace.
  if ($cache['ignore_words_regex']) {
    $words_removed = $cache['ignore_words_callback']($cache['ignore_words_regex'], '', $output);
    if (backdrop_strlen(trim($words_removed)) > 0) {
      $output = $words_removed;
    }
  }

  // Always replace whitespace with the separator.
  $output = preg_replace('/\s+/', $cache['separator'], $output);

  // Trim duplicates and remove trailing and leading separators.
  $output = _path_clean_separators($output, $cache['separator']);

  // Optionally convert to lower case.
  if ($cache['lowercase']) {
    $output = backdrop_strtolower($output);
  }

  // Shorten to a logical place based on word boundaries.
  $output = truncate_utf8($output, $cache['maxlength'], TRUE);

  // Cache this result in the static array.
  $cache['strings'][$langcode][$string] = $output;

  return $output;
}