@inproceedings{anderson2010backup,
  ADDRESS = {Berkeley, CA},
  AUTHOR = {Anderson, Paul and Zhang, Le},
  BOOKTITLE = {Proceedings of the Large Installations Systems Administration (LISA) Conference},
  MONTH = nov,
  ORGANIZATION = {Usenix Association},
  TITLE = {Fast and Secure Laptop Backups with Encrypted De-duplication},
  URL = {http://homepages.inf.ed.ac.uk/dcspaul/publications/lisa2010.pdf},
  YEAR = 2010
}

  AUTHOR = {Garner, Philip N. and Dines, John and Hain, Thomas
                  and El Hannani, Asmaa and Karafi{\'a}t, Martin and
                  Korchagin, Danil and Lincoln, Mike and Wan, Vincent
                  and Zhang, Le},
  TITLE = {Real-Time {ASR} from Meetings},
  BOOKTITLE = {Proceedings of Interspeech},
  YEAR = 2009,
  MONTH = {September},
  ADDRESS = {{B}righton, {UK}},
  PDF = {http://publications.idiap.ch/downloads/papers/2009/Garner_INTERSPEECH_2009.pdf}

  AUTHOR = {Le Zhang},
  TITLE = {Modelling Speech Dynamics with Trajectory-{HMM}s},
  SCHOOL = {School of Informatics, University of Edinburgh},
  YEAR = 2009,
  CATEGORIES = {speech recognition, speech synthesis, MOCHA, trajectory HMM},
  MONTH = {January},
The conditional independence assumption imposed
by the hidden Markov models (HMMs) makes it difficult to model temporal
correlation patterns in human speech. Traditionally, this limitation
is circumvented by appending the first and second-order regression
coefficients to the observation feature vectors. Although this leads
to improved performance in recognition tasks, we argue that a straightforward
use of dynamic features in HMMs will result in an inferior model,
due to the incorrect handling of dynamic constraints. In this thesis
I will show that an HMM can be transformed into a Trajectory-HMM capable
of generating smoothed output mean trajectories, by performing a per-utterance
normalisation. The resulting model can be trained by either maximising
model log-likelihood or minimising mean generation errors on the training
data. To combat the exponential growth of paths in searching, the
idea of delayed path merging is proposed and a new time-synchronous
decoding algorithm built on the concept of token-passing is designed
for use in the recognition task. The Trajectory-HMM brings a new way
of sharing knowledge between speech recognition and synthesis components,
by tackling both problems in a coherent statistical framework. I evaluated
the Trajectory-HMM on two different speech tasks using the speaker-dependent
MOCHA-TIMIT database. First as a generative model to recover articulatory
features from speech signal, where the Trajectory-HMM was used in
a complementary way to the conventional HMM modelling techniques,
within a joint Acoustic-Articulatory framework. Experiments indicate
that the jointly trained acoustic-articulatory models are more accurate
(having a lower Root Mean Square error) than the separately trained
ones, and that Trajectory-HMM training results in greater accuracy
compared with conventional Baum-Welch parameter updating. In addition,
the Root Mean Square (RMS) training objective proves to be consistently
better than the Maximum Likelihood objective. However, experiment
of the phone recognition task shows that the MLE trained Trajectory-HMM,
while retaining attractive properties of being a proper generative
model, tends to favour over-smoothed trajectories among competing
hypothesises, and does not perform better than a conventional HMM.
We use this to build an argument that models giving a better fit on
training data may suffer a reduction of discrimination by being too
faithful to the training data. Finally, experiments on using triphone
models show that increasing modelling detail is an effective way to
leverage modelling performance with little added complexity in training.
  PDF = {http://www.cstr.ed.ac.uk/downloads/publications/2009/zhangle_thesis.pdf}

  AUTHOR = {Le Zhang and Steve Renals},
  TITLE = {Acoustic-Articulatory Modelling with the Trajectory {HMM}},
  JOURNAL = {IEEE Signal Processing Letters},
  YEAR = 2008,
  VOLUME = 15,
  PAGES = {245-248},
  PDF = {http://www.cstr.ed.ac.uk/downloads/publications/2008/zhang-spl.pdf},
  CATEGORIES = {articulatory inversion},
  ABSTRACT = { 	
In this letter, we introduce an hidden Markov model (HMM)-based inversion system to recovery articulatory movements from speech acoustics. Trajectory HMMs are used as generative models for modelling articulatory data. Experiments on the MOCHA-TIMIT corpus indicate that the jointly trained acoustic-articulatory models are more accurate (lower RMS error) than the separately trained ones, and that trajectory HMM training results in greater accuracy compared with conventional maximum likelihood HMM training. Moreover, the system has the ability to synthesize articulatory movements directly from a textual representation.

  AUTHOR = {Le Zhang and Steve Renals},
  TITLE = {Phone Recognition Analysis for Trajectory {HMM}},
  BOOKTITLE = {Proc. Interspeech 2006},
  YEAR = 2006,
  ADDRESS = {Pittsburgh, USA},
  MONTH = {September},
    The trajectory {HMM} has been shown to be useful for model-based speech
    synthesis where a smoothed trajectory is generated using temporal
    constraints imposed by dynamic features. To evaluate the performance of such
    model on an ASR task, we present a trajectory decoder based on tree search
    with delayed path merging. Experiment on a speaker-dependent phone
    recognition task using the MOCHA-TIMIT database shows that the MLE-trained
    trajectory model, while retaining attractive properties of being a proper
    generative model, tends to favour over-smoothed trajectory among competing
    hypothesises, and does not perform better than a conventional {HMM}. We use
    this to build an argument that models giving better fit on training data may
    suffer a reduction of discrimination by being too faithful to training data.
    This partially explains why alternative acoustic models that try to
    explicitly model temporal constraints do not achieve significant
    improvements in ASR.
  PDF = {http://www.cstr.ed.ac.uk/downloads/publications/2006/zhang-icslp2006.pdf},
  CATEGORIES = {asr}

  AUTHOR = {Le Zhang and Jingbo Zhu and Tianshun Yao},
  TITLE = {An Evaluation of Statistical Spam Filtering
  JOURNAL = {ACM Transactions on Asian Language Information
                  Processing (TALIP)},
  YEAR = 2004,
  VOLUME = 3,
  NUMBER = 4,
  PAGES = {243-269},
  MONTH = {December},
  This paper evaluates five supervised learning methods in the context of
      statistical spam filtering. We study the impact of different feature
      pruning methods and feature set sizes on each learner's performance using
      cost-sensitive measures. It is observed that the significance of feature
      selection varies greatly from classifier to classifier. In particular, we
      found Support Vector Machine, AdaBoost and Maximum Entropy Model are top
      performers in this evaluation, sharing similar characteristics: not
      sensitive to feature selection strategy, easily scalable to very high
      feature dimension and good performances across different datasets. In
      contrast, Naive Bayes, a commonly used classifier in spam filtering, is
      found to be sensitive to feature selection methods on small feature set,
  and fail to function well in scenarios where false positives are penalized
      heavily. The experiments also suggest that aggressive feature pruning
      should be avoided when building filters to be used in applications where
      legitimate mails are assigned a cost much higher than spams (such as
              $\lambda=999$), so as to maintain a better-than-baseline
      performance. An interesting finding is the effect of mail headers on spam
      filtering, which is often ignored in previous studies. Experiments show
      that classifiers using features from message header alone can achieve
      comparable or better performance than filters utilizing body features
      only. This suggests that message headers can be reliable and powerfully
      discriminative feature sources for spam filtering.},
  ZH1CORPUS = {http://homepages.inf.ed.ac.uk/s0450736/spam/zh1.tar.bz2},
  PS = {http://homepages.inf.ed.ac.uk/s0450736/paper/2004-spameval.ps.gz},
  PDF = {http://homepages.inf.ed.ac.uk/s0450736/paper/2004-spameval.pdf}

  AUTHOR = {Xueqiang L\"{U} and Le Zhang and Junfeng Hu},
  TITLE = {Statistical Substring Reduction in Linear Time},
  BOOKTITLE = {Proceeding of the 1st International Joint Conference
                  on Natural Language Processing (IJCNLP-04)},
  YEAR = 2004,
  ADDRESS = {Sanya, Hainan island, China},
  MONTH = {March},
  ABSTRACT = {We study the problem of efficiently removing equal
                  frequency ngram substrings from an ngram set,
                  formally called Statistical Substring Reduction
                  (SSR). SSR is a useful operation in corpus based
                  multi-word unit research and new word identification
                  task of oriental language processing. We present a
                  new SSR algorithm that has linear time ($O(n)$), and
                  prove its equivalence with the traditional $O(n^2)$
                  algorithm. In particular, using experimental results
                  from several corpora with different sizes, we show
                  that it is possible to achieve performance close to
                  that theoretically predicated for this task. Even in
                  a small corpus the new algorithm is several orders
                  of magnitude faster than the $O(n^2)$ one. These
                  results show that our algorithm is reliable and
                  efficient, and is therefore an appropriate choice
                  for large scale corpus processing.},
  PS = {http://homepages.inf.ed.ac.uk/s0450736/paper/2004_linearssr.ps.gz},
  PDF = {http://homepages.inf.ed.ac.uk/s0450736/paper/2004_linearssr.pdf},
  SOFTWARE = {http://homepages.inf.ed.ac.uk/s0450736/ngram.html},
  ERRATA = {http://homepages.inf.ed.ac.uk/s0450736/paper/2004_err_linearssr.txt}

  AUTHOR = {Le Zhang and Tianshun Yao},
  TITLE = {Filtering Junk Mail with a Maximum Entropy Model},
  BOOKTITLE = {Proceeding of 20th International Conference on
                  Computer Processing of Oriental Languages
  YEAR = 2003,
  PAGES = {446-453},
  The task of junk mail filtering is to rule out unsolicited bulk e-mail (junk)
      automatically from a user's mail stream. Two classes of methods have been
      shown to be useful for classifying e-mail messages. The rule based method
      uses a set of heuristic rules to classify e-mail messages while the
      statistical based approach models the difference of messages
      statistically, usually under a machine learning framework. Generally
      speaking, the statistical based methods are found to outperform the rule
      based method, yet we found, by combining different kinds of evidence used
      in the two approaches into a single statistical model, further improvement
      can be obtained. We present such a hybrid approach, utilizing a Maximum
      Entropy Model, and show how to use it in a junk mail filtering task. In
      particular, we present an extensive experimental comparison of our
      approach with a Naive Bayes classifier, a widely used classifier in e-mail
      filtering task, and show that this approach performs comparable or better
      than Naive Bayes method.
  PS = {http://homepages.inf.ed.ac.uk/s0450736/paper/junk.ps.gz},
  PDF = {http://homepages.inf.ed.ac.uk/s0450736/paper/junk.pdf},
  SLIDE = {http://homepages.inf.ed.ac.uk/s0450736/paper/junk-slide.pdf},
  SOFTWARE = {http://homepages.inf.ed.ac.uk/s0450736/maxent_toolkit.html}

  AUTHOR = {Le Zhang and Xueqiang L\"{U} and Yanna Shen and
                  Tianshun Yao},
  TITLE = {A Statistical Approach to Extract Chinese Chunk
                  Candidates from Large Corpora},
  BOOKTITLE = {Proceeding of 20th International Conference on
                  Computer Processing of Oriental Languages
  YEAR = 2003,
  PAGES = {109-117},
  The extraction of Chunk candidates from real corpora is one of the fundamental
      tasks of building example-based machine translation model. This paper
      presents a statistical approach to extract Chinese chunk candidates from
      large monolingual corpora. The first step is to extract large N-grams (up
              to 20-gram) from raw corpus. Then two newly proposed Fast
      Statistical Substring Reduction (FSSR) algorithms can be applied to the
      initial N-gram set to remove some unnecessary N-grams using their
      frequency information. The two algorithms are efficient (both have a time
              complexity of $O(n)$) and can effectively reduce the size of
      N-gram set up to 50\%. Finally, mutual information is used to obtain chunk
      candidates from reduced N-gram set. Perhaps the biggest contribution of
      this paper is that it is the first time to apply Fast Statistical
      Substring Reduction algorithm to large corpora and demonstrate the
      effectiveness and efficiency of this algorithm which, in our hope, will
      shed new light on large scale corpus oriented research. Experiments on
      three corpora with different sizes show that this method can extract chunk
      candidates from corpora of giga bytes efficiently under current
      computational power. We get an extraction accuracy of 86.3\% from People
      Daily 2000 news corpus. 
  PS = {http://homepages.inf.ed.ac.uk/s0450736/paper/extract_chunk.ps.gz},
  PDF = { http://homepages.inf.ed.ac.uk/s0450736/paper/extract_chunk.pdf},
  SLIDE = {http://homepages.inf.ed.ac.uk/s0450736/paper/extract_chunk_slide.pdf},
  SOFTWARE = {http://homepages.inf.ed.ac.uk/s0450736/ngram.html}

This file has been generated by bibtex2html 1.82.