Does that help to clarify the limitation?
Yes, it does. Thanks.
More questions follows. I see that sphinx is learning
only attachments with ContentType = ‘text/plain’ entries
which looks unfortunate since I have tons of html email.
Did indexing text/html and having html_strip=1 [1]
in sphinx produce any problems that caused only text/plain
to be choosen for indexation in rt-setup-fulltext-index?
- Sphinx | Open Source Search Server
ps. here is my setup with delta indexes, could be useful for
other prople
source rt {
type = mysql
sql_host = localhost
sql_db = rt3
sql_user =
sql_pass =
sql_query_pre = SET NAMES utf8
sql_query_pre = REPLACE INTO SphinxCounters SELECT 1, MAX(id) FROM Attachments
sql_query = \
SELECT a.id, a.content FROM Attachments a \
JOIN Transactions txn ON a.TransactionId = txn.id AND txn.ObjectType = 'RT::Ticket' \
JOIN Tickets t ON txn.ObjectId = t.id \
WHERE a.ContentType = 'text/plain' AND t.Status != 'deleted' \
AND a.id<=( SELECT max_doc_id FROM SphinxCounters WHERE counter_id=1 )
sql_query_info = SELECT * FROM Attachments WHERE id=$id
}
source rt_delta : rt {
sql_query_pre = SET NAMES utf8
sql_query =
SELECT a.id, a.content FROM Attachments a
JOIN Transactions txn ON a.TransactionId = txn.id AND txn.ObjectType = ‘RT::Ticket’
JOIN Tickets t ON txn.ObjectId = t.id
WHERE a.ContentType = ‘text/plain’ AND t.Status != ‘deleted’
AND a.id>( SELECT max_doc_id FROM SphinxCounters WHERE counter_id=1 )
sql_query_info = SELECT * FROM Attachments WHERE id=$id
sql_query_post_index =
}
index rt {
source = rt
path = /var/lib/sphinx/rt.index
docinfo = extern
charset_type = utf-8
charset_table = 0…9, A…Z->a…z, a…z, U+0143->U+0144, U+0104->U+0105, U+0106->U+0107, U+0118->U+0119, U+0141->U+0142, U+00D3->U+00F3, U+015A->U+015B,
U+0179->U+017A, U+017B->U+017C, U+0105, U+0107, U+0119, U+0142, U+00F3, U+015B, U+017A, U+017C, U+0144
}
(various charset_table are required for many non-english languages)
index rt_delta : rt
{
source = rt_delta
path = /var/lib/sphinx/rt.delta.index
}
CREATE TABLE AttachmentsIndex
(
id
int(10) unsigned NOT NULL,
weight
int(11) NOT NULL,
query
varchar(3072) NOT NULL,
KEY query
(query
(255))
) ENGINE=SPHINX DEFAULT CHARSET=utf8 CONNECTION=‘sphinx://127.0.0.1:3312/rt,rt_delta’
(note, two indexes in CONNECTION)
CREATE TABLE SphinxCounters
(
counter_id
int(11) NOT NULL,
max_doc_id
int(11) NOT NULL,
PRIMARY KEY (counter_id
)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
all indexes updated daily, rt_delta updated every 15 minutes
cat /etc/cron.d/sphinx
@daily root /usr/bin/indexer --quiet --rotate --all
*/15 * * * * root /usr/bin/indexer --quiet --rotate rt_delta
Arkadiusz Miśkiewicz PLD/Linux Team
arekm / maven.pl http://ftp.pld-linux.org/