site-accueil-insa/matomo/core/Segment.php

665 lines
24 KiB
PHP
Raw Normal View History

2022-10-26 08:59:36 +02:00
<?php
/**
* Matomo - free/libre analytics platform
*
* @link https://matomo.org
* @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
*
*/
namespace Piwik;
use Exception;
use Piwik\API\Request;
use Piwik\ArchiveProcessor\Rules;
use Piwik\Cache as PiwikCache;
use Piwik\Container\StaticContainer;
use Piwik\DataAccess\LogQueryBuilder;
use Piwik\Plugins\SegmentEditor\SegmentEditor;
use Piwik\Segment\SegmentExpression;
use Piwik\Plugins\SegmentEditor\Model as SegmentEditorModel;
/**
* Limits the set of visits Piwik uses when aggregating analytics data.
*
* A segment is a condition used to filter visits. They can, for example,
* select visits that have a specific browser or come from a specific
* country, or both.
*
* Plugins that aggregate data stored in Piwik can support segments by
* using this class when generating aggregation SQL queries.
*
* ### Examples
*
* **Basic usage**
*
* $idSites = array(1,2,3);
* $segmentStr = "browserCode==ff;countryCode==CA";
* $segment = new Segment($segmentStr, $idSites);
*
* $query = $segment->getSelectQuery(
* $select = "table.col1, table2.col2",
* $from = array("table", "table2"),
* $where = "table.col3 = ?",
* $bind = array(5),
* $orderBy = "table.col1 DESC",
* $groupBy = "table2.col2"
* );
*
* Db::fetchAll($query['sql'], $query['bind']);
*
* **Creating a _null_ segment**
*
* $idSites = array(1,2,3);
* $segment = new Segment('', $idSites);
* // $segment->getSelectQuery will return a query that selects all visits
*
* @api
*/
class Segment
{
/**
* @var SegmentExpression
*/
protected $segmentExpression = null;
/**
* @var string
*/
protected $string = null;
/**
* @var string
*/
protected $originalString = null;
/**
* @var array
*/
protected $idSites = null;
/**
* @var Date
*/
protected $startDate = null;
/**
* @var Date
*/
protected $endDate = null;
/**
* @var LogQueryBuilder
*/
private $segmentQueryBuilder;
/**
* @var bool
*/
private $isSegmentEncoded;
/**
* Truncate the Segments to 8k
*/
const SEGMENT_TRUNCATE_LIMIT = 8192;
const CACHE_KEY = 'segmenthashes';
const SEGMENT_HAS_BUILT_CACHE_KEY ='segmenthashbuilt';
/**
* Constructor.
*
* When using segments that contain a != or !@ condition on a non visit dimension (e.g. action, conversion, ...) it
* is needed to use a subquery to get correct results. To avoid subqueries that fetch too many data it's required to
* set a startDate and/or an endDate in this case. That date will be used to limit the subquery (along with possibly
* given idSites). If no startDate and endDate is given for such a segment it will generate a query that directly
* joins the according tables, but trigger a php warning as results might be incorrect.
*
* @param string $segmentCondition The segment condition, eg, `'browserCode=ff;countryCode=CA'`.
* @param array $idSites The list of sites the segment will be used with. Some segments are
* dependent on the site, such as goal segments.
* @param Date|null $startDate start date used to limit subqueries
* @param Date|null $endDate end date used to limit subqueries
* @throws
*/
public function __construct($segmentCondition, $idSites, Date $startDate = null, Date $endDate = null)
{
$this->segmentQueryBuilder = StaticContainer::get('Piwik\DataAccess\LogQueryBuilder');
$segmentCondition = trim($segmentCondition);
if (!SettingsPiwik::isSegmentationEnabled()
&& !empty($segmentCondition)
) {
throw new Exception("The Super User has disabled the Segmentation feature.");
}
$this->originalString = $segmentCondition;
if ($startDate instanceof Date) {
$this->startDate = $startDate;
}
if ($endDate instanceof Date) {
$this->endDate = $endDate;
}
// The segment expression can be urlencoded. Unfortunately, both the encoded and decoded versions
// can usually be parsed successfully. To pick the right one, we try both and pick the one w/ more
// successfully parsed subexpressions.
$subexpressionsDecoded = 0;
try {
$this->initializeSegment(urldecode($segmentCondition), $idSites);
$subexpressionsDecoded = $this->segmentExpression->getSubExpressionCount();
} catch (Exception $e) {
// ignore
}
$subexpressionsRaw = 0;
try {
$this->initializeSegment($segmentCondition, $idSites);
$subexpressionsRaw = $this->segmentExpression->getSubExpressionCount();
} catch (Exception $e) {
// ignore
}
if ($subexpressionsRaw > $subexpressionsDecoded) {
$this->initializeSegment($segmentCondition, $idSites);
$this->isSegmentEncoded = false;
} else {
$this->initializeSegment(urldecode($segmentCondition), $idSites);
$this->isSegmentEncoded = true;
}
}
/**
* Returns the segment expression.
* @return SegmentExpression
* @api since Piwik 3.2.0
*/
public function getSegmentExpression()
{
return $this->segmentExpression;
}
/**
* @throws Exception
*/
private function getAvailableSegments()
{
// start cache
$cache = PiwikCache::getTransientCache();
//covert cache id
$cacheId = 'API.getSegmentsMetadata.'.SettingsPiwik::getPiwikInstanceId() . '.' . implode(",", $this->idSites);
//fetch cache lockId
$this->availableSegments = $cache->fetch($cacheId);
// segment metadata
if (empty($this->availableSegments)) {
$this->availableSegments = Request::processRequest('API.getSegmentsMetadata', array(
'idSites' => $this->idSites,
'_hideImplementationData' => 0,
'filter_limit' => -1,
'filter_offset' => 0,
'_showAllSegments' => 1,
), []);
$cache->save($cacheId, $this->availableSegments);
}
return $this->availableSegments;
}
private function getSegmentByName($name)
{
$segments = $this->getAvailableSegments();
foreach ($segments as $segment) {
if ($segment['segment'] == $name && !empty($name)) {
// check permission
if (isset($segment['permission']) && $segment['permission'] != 1) {
throw new NoAccessException("You do not have enough permission to access the segment " . $name);
}
return $segment;
}
}
throw new Exception("Segment '$name' is not a supported segment.");
}
/**
* @param $string
* @param $idSites
* @throws Exception
*/
protected function initializeSegment($string, $idSites)
{
// As a preventive measure, we restrict the filter size to a safe limit
$string = substr($string, 0, self::SEGMENT_TRUNCATE_LIMIT);
$this->string = $string;
if (empty($idSites)) {
$idSites = [];
} else if (!is_array($idSites)) {
$idSites = [$idSites];
}
$this->idSites = $idSites;
$segment = new SegmentExpression($string);
$this->segmentExpression = $segment;
// parse segments
$expressions = $segment->parseSubExpressions();
$expressions = $this->getExpressionsWithUnionsResolved($expressions);
// convert segments name to sql segment
// check that user is allowed to view this segment
// and apply a filter to the value to match if necessary (to map DB fields format)
$cleanedExpressions = array();
foreach ($expressions as $expression) {
$operand = $expression[SegmentExpression::INDEX_OPERAND];
$expression[SegmentExpression::INDEX_OPERAND] = $this->getCleanedExpression($operand);
$cleanedExpressions[] = $expression;
}
$segment->setSubExpressionsAfterCleanup($cleanedExpressions);
}
private function getExpressionsWithUnionsResolved($expressions)
{
$expressionsWithUnions = array();
foreach ($expressions as $expression) {
$operand = $expression[SegmentExpression::INDEX_OPERAND];
$name = $operand[SegmentExpression::INDEX_OPERAND_NAME];
$availableSegment = $this->getSegmentByName($name);
// We leave segments using !@ and != operands untouched for segments not on log_visit table as they will be build using a subquery
if (!$this->doesSegmentNeedSubquery($operand[SegmentExpression::INDEX_OPERAND_OPERATOR], $name) && !empty($availableSegment['unionOfSegments'])) {
$count = 0;
foreach ($availableSegment['unionOfSegments'] as $segmentNameOfUnion) {
$count++;
$operator = SegmentExpression::BOOL_OPERATOR_OR; // we connect all segments within that union via OR
if ($count === count($availableSegment['unionOfSegments'])) {
$operator = $expression[SegmentExpression::INDEX_BOOL_OPERATOR];
}
$operand[SegmentExpression::INDEX_OPERAND_NAME] = $segmentNameOfUnion;
$expressionsWithUnions[] = array(
SegmentExpression::INDEX_BOOL_OPERATOR => $operator,
SegmentExpression::INDEX_OPERAND => $operand
);
}
} else {
$expressionsWithUnions[] = array(
SegmentExpression::INDEX_BOOL_OPERATOR => $expression[SegmentExpression::INDEX_BOOL_OPERATOR],
SegmentExpression::INDEX_OPERAND => $operand
);
}
}
return $expressionsWithUnions;
}
private function isVisitSegment($name)
{
$availableSegment = $this->getSegmentByName($name);
if (!empty($availableSegment['unionOfSegments'])) {
foreach ($availableSegment['unionOfSegments'] as $segmentNameOfUnion) {
$unionSegment = $this->getSegmentByName($segmentNameOfUnion);
if (strpos($unionSegment['sqlSegment'], 'log_visit.') === 0) {
return true;
}
}
} else if (strpos($availableSegment['sqlSegment'], 'log_visit.') === 0) {
return true;
}
return false;
}
private function doesSegmentNeedSubquery($operator, $segmentName)
{
$requiresSubQuery = in_array($operator, [
SegmentExpression::MATCH_DOES_NOT_CONTAIN,
SegmentExpression::MATCH_NOT_EQUAL
]) && !$this->isVisitSegment($segmentName);
if ($requiresSubQuery && empty($this->startDate) && empty($this->endDate)) {
if (Development::isEnabled()) {
$e = new Exception();
Log::warning("Avoiding segment subquery due to missing start date and/or an end date. Please ensure a start date and/or end date is set when initializing a segment if it's used to build a query. Stacktrace:\n" . $e->getTraceAsString());
}
return false;
}
return $requiresSubQuery;
}
private function getInvertedOperatorForSubQuery($operator)
{
if ($operator === SegmentExpression::MATCH_DOES_NOT_CONTAIN) {
return SegmentExpression::MATCH_CONTAINS;
} else if ($operator === SegmentExpression::MATCH_NOT_EQUAL) {
return SegmentExpression::MATCH_EQUAL;
}
throw new Exception("Operator not support for subqueries");
}
/**
* Returns `true` if the segment is empty, `false` if otherwise.
*/
public function isEmpty()
{
return $this->segmentExpression->isEmpty();
}
/**
* Detects whether the Piwik instance is configured to be able to archive this segment. It checks whether the segment
* will be either archived via browser or cli archiving. It does not check if the segment has been archived. If you
* want to know whether the segment has been archived, the actual report data needs to be requested.
*
* This method does not take any date/period into consideration. Meaning a Piwik instance might be able to archive
* this segment in general, but not for a certain period if eg the archiving of range dates is disabled.
*
* @return bool
*/
public function willBeArchived()
{
if ($this->isEmpty()) {
return true;
}
$idSites = $this->idSites;
return Rules::isRequestAuthorizedToArchive()
|| Rules::isBrowserArchivingAvailableForSegments()
|| Rules::isSegmentPreProcessed($idSites, $this);
}
protected $availableSegments = array();
protected function getCleanedExpression($expression)
{
$name = $expression[SegmentExpression::INDEX_OPERAND_NAME];
$matchType = $expression[SegmentExpression::INDEX_OPERAND_OPERATOR];
$value = $expression[SegmentExpression::INDEX_OPERAND_VALUE];
$segment = $this->getSegmentByName($name);
$sqlName = $segment['sqlSegment'];
// Build subqueries for segments that are not on log_visit table but use !@ or != as operator
// This is required to ensure segments like actionUrl!@value really do not include any visit having an action containing `value`
if ($this->doesSegmentNeedSubquery($matchType, $name)) {
$operator = $this->getInvertedOperatorForSubQuery($matchType);
$stringSegment = $name . $operator . $value;
$segmentObj = new Segment($stringSegment, $this->idSites, $this->startDate, $this->endDate);
$select = 'log_visit.idvisit';
$from = 'log_visit';
$datetimeField = 'visit_last_action_time';
$where = [];
$bind = [];
if (!empty($this->idSites)) {
$where[] = "$from.idsite IN (" . Common::getSqlStringFieldsArray($this->idSites) . ")";
$bind = $this->idSites;
}
if ($this->startDate instanceof Date) {
$where[] = "$from.$datetimeField >= ?";
$bind[] = $this->startDate->toString(Date::DATE_TIME_FORMAT);
}
if ($this->endDate instanceof Date) {
$where[] = "$from.$datetimeField <= ?";
$bind[] = $this->endDate->toString(Date::DATE_TIME_FORMAT);
}
$logQueryBuilder = StaticContainer::get('Piwik\DataAccess\LogQueryBuilder');
$forceGroupByBackup = $logQueryBuilder->getForcedInnerGroupBySubselect();
$logQueryBuilder->forceInnerGroupBySubselect(LogQueryBuilder::FORCE_INNER_GROUP_BY_NO_SUBSELECT);
$query = $segmentObj->getSelectQuery($select, $from, implode(' AND ', $where), $bind);
$logQueryBuilder->forceInnerGroupBySubselect($forceGroupByBackup);
return ['log_visit.idvisit', SegmentExpression::MATCH_ACTIONS_NOT_CONTAINS, $query];
}
if ($matchType != SegmentExpression::MATCH_IS_NOT_NULL_NOR_EMPTY
&& $matchType != SegmentExpression::MATCH_IS_NULL_OR_EMPTY) {
if (isset($segment['sqlFilterValue'])) {
$value = call_user_func($segment['sqlFilterValue'], $value, $segment['sqlSegment']);
}
// apply presentation filter
if (isset($segment['sqlFilter'])) {
$value = call_user_func($segment['sqlFilter'], $value, $segment['sqlSegment'], $matchType, $name);
if(is_null($value)) { // null is returned in TableLogAction::getIdActionFromSegment()
return array(null, $matchType, null);
}
// sqlFilter-callbacks might return arrays for more complex cases
// e.g. see TableLogAction::getIdActionFromSegment()
if (is_array($value) && isset($value['SQL'])) {
// Special case: returned value is a sub sql expression!
$matchType = SegmentExpression::MATCH_ACTIONS_CONTAINS;
}
}
}
return array($sqlName, $matchType, $value);
}
/**
* Returns the segment condition.
*
* @return string
*/
public function getString()
{
return $this->string;
}
/**
* Returns a hash of the segment condition, or the empty string if the segment
* condition is empty.
*
* @return string
*/
public function getHash()
{
if (empty($this->string)) {
return '';
}
return self::getSegmentHash($this->string);
}
public static function getSegmentHash($definition)
{
$cache = Cache::getEagerCache();
$cacheKey = self::CACHE_KEY . md5($definition);
if ($cache->contains($cacheKey)) {
return $cache->fetch($cacheKey);
}
$defaultHash = md5(urldecode($definition));
// if the cache for segments already built, but this segment was not found,
// we return the default segment, this can be a segment from url or
// something like "visitorType==new"
if ($cache->contains(self::SEGMENT_HAS_BUILT_CACHE_KEY)) {
return $defaultHash;
}
// the segment hash is not built yet, let's do it
$model = new SegmentEditorModel();
$segments = $model->getAllSegmentsAndIgnoreVisibility();
foreach ($segments as $segment) {
$cacheKeyTemp = self::CACHE_KEY . md5($segment['definition']);
$cache->save($cacheKeyTemp, $segment['hash']);
$cacheKeyTemp = self::CACHE_KEY . md5(urldecode($segment['definition']));
$cache->save($cacheKeyTemp, $segment['hash']);
$cacheKeyTemp = self::CACHE_KEY . md5(urlencode($segment['definition']));
$cache->save($cacheKeyTemp, $segment['hash']);
}
$cache->save(self::SEGMENT_HAS_BUILT_CACHE_KEY, true);
// if we found the segment, return it's hash, but maybe this
// segment is not stored in the db, return the default
if ($cache->contains($cacheKey)) {
return $cache->fetch($cacheKey);
}
return $defaultHash;
}
/**
* Extend an SQL query that aggregates data over one of the 'log_' tables with segment expressions.
*
* @param string $select The select clause. Should NOT include the **SELECT** just the columns, eg,
* `'t1.col1 as col1, t2.col2 as col2'`.
* @param array|string $from Array of table names (without prefix), eg, `array('log_visit', 'log_conversion')`.
* @param false|string $where (optional) Where clause, eg, `'t1.col1 = ? AND t2.col2 = ?'`.
* @param array|string $bind (optional) Bind parameters, eg, `array($col1Value, $col2Value)`.
* @param false|string $orderBy (optional) Order by clause, eg, `"t1.col1 ASC"`.
* @param false|string $groupBy (optional) Group by clause, eg, `"t2.col2"`.
* @param int $limit Limit number of result to $limit
* @param int $offset Specified the offset of the first row to return
* @param bool $forceGroupBy Force the group by and not using a subquery. Note: This may make the query slower see https://github.com/matomo-org/matomo/issues/9200#issuecomment-183641293
* A $groupBy value needs to be set for this to work.
* @param int If set to value >= 1 then the Select query (and All inner queries) will be LIMIT'ed by this value.
* Use only when you're not aggregating or it will sample the data.
* @return string The entire select query.
*/
public function getSelectQuery($select, $from, $where = false, $bind = array(), $orderBy = false, $groupBy = false, $limit = 0, $offset = 0, $forceGroupBy = false)
{
$segmentExpression = $this->segmentExpression;
$limitAndOffset = null;
if($limit > 0) {
$limitAndOffset = (int) $offset . ', ' . (int) $limit;
}
try {
if ($forceGroupBy && $groupBy) {
$this->segmentQueryBuilder->forceInnerGroupBySubselect(LogQueryBuilder::FORCE_INNER_GROUP_BY_NO_SUBSELECT);
}
$result = $this->segmentQueryBuilder->getSelectQueryString($segmentExpression, $select, $from, $where, $bind,
$groupBy, $orderBy, $limitAndOffset);
} catch (Exception $e) {
if ($forceGroupBy && $groupBy) {
$this->segmentQueryBuilder->forceInnerGroupBySubselect('');
}
throw $e;
}
if ($forceGroupBy && $groupBy) {
$this->segmentQueryBuilder->forceInnerGroupBySubselect('');
}
return $result;
}
/**
* Returns the segment string.
*
* @return string
*/
public function __toString()
{
return (string) $this->getString();
}
/**
* Combines this segment with another segment condition, if the segment condition is not already
* in the segment.
*
* The combination is naive in that it does not take order of operations into account.
*
* @param string $segment
* @param string $operator The operator to use. Should be either SegmentExpression::AND_DELIMITER
* or SegmentExpression::OR_DELIMITER.
* @param string $segmentCondition The segment condition to add.
* @return string
* @throws Exception
*/
public static function combine($segment, $operator, $segmentCondition)
{
if (empty($segment)) {
return $segmentCondition;
}
if (empty($segmentCondition)
|| self::containsCondition($segment, $operator, $segmentCondition)
) {
return $segment;
}
return $segment . $operator . $segmentCondition;
}
private static function containsCondition($segment, $operator, $segmentCondition)
{
// check when segment/condition are of same encoding
return strpos($segment, $operator . $segmentCondition) !== false
|| strpos($segment, $segmentCondition . $operator) !== false
// check when both operator & condition are urlencoded in $segment
|| strpos($segment, urlencode($operator . $segmentCondition)) !== false
|| strpos($segment, urlencode($segmentCondition . $operator)) !== false
// check when operator is not urlencoded, but condition is in $segment
|| strpos($segment, $operator . urlencode($segmentCondition)) !== false
|| strpos($segment, urlencode($segmentCondition) . $operator) !== false
// check when segment condition is urlencoded & $segment isn't
|| strpos($segment, $operator . urldecode($segmentCondition)) !== false
|| strpos($segment, urldecode($segmentCondition) . $operator) !== false
|| $segment === $segmentCondition
|| $segment === urlencode($segmentCondition)
|| $segment === urldecode($segmentCondition);
}
public function getStoredSegmentName($idSite)
{
$segment = $this->getString();
if (empty($segment)) {
return Piwik::translate('SegmentEditor_DefaultAllVisits');
}
$availableSegments = SegmentEditor::getAllSegmentsForSite($idSite);
$foundStoredSegment = null;
foreach ($availableSegments as $storedSegment) {
if ($storedSegment['definition'] == $segment
|| $storedSegment['definition'] == urldecode($segment)
|| $storedSegment['definition'] == urlencode($segment)
|| $storedSegment['definition'] == $this->originalString
|| $storedSegment['definition'] == urldecode($this->originalString)
|| $storedSegment['definition'] == urlencode($this->originalString)
) {
$foundStoredSegment = $storedSegment;
}
}
if (isset($foundStoredSegment)) {
return $foundStoredSegment['name'];
}
return $this->isSegmentEncoded ? urldecode($segment) : $segment;
}
public function getOriginalString()
{
return $this->originalString;
}
}